From e72bbaf7d5fe36a55a0e5787e95cd215d3e96306 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Fri, 7 Jun 2024 20:12:51 -0400 Subject: [PATCH 01/69] bump (#3383) Co-authored-by: v-chen_data --- .github/workflows/code-quality.yaml | 2 +- .github/workflows/codeql-analysis.yml | 2 +- .github/workflows/coverage.yaml | 2 +- .github/workflows/daily.yaml | 4 ++-- .github/workflows/pr-cpu.yaml | 2 +- .github/workflows/pr-gpu.yaml | 6 +++--- .github/workflows/release.yaml | 2 +- .github/workflows/smoketest.yaml | 2 +- 8 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/code-quality.yaml b/.github/workflows/code-quality.yaml index 338fa77a17..c35546f4ca 100644 --- a/.github/workflows/code-quality.yaml +++ b/.github/workflows/code-quality.yaml @@ -34,7 +34,7 @@ jobs: uses: actions/checkout@v3 with: repository: mosaicml/ci-testing - ref: v0.0.7 + ref: v0.0.8 path: ./ci-testing - uses: ./ci-testing/.github/actions/code-quality with: diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index a8a510bffb..0cb835fbde 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -45,7 +45,7 @@ jobs: uses: actions/checkout@v3 with: repository: mosaicml/ci-testing - ref: v0.0.7 + ref: v0.0.8 path: ./ci-testing - uses: ./ci-testing/.github/actions/codeql-analysis with: diff --git a/.github/workflows/coverage.yaml b/.github/workflows/coverage.yaml index 1bdae1efb8..9432e8c6c9 100644 --- a/.github/workflows/coverage.yaml +++ b/.github/workflows/coverage.yaml @@ -16,7 +16,7 @@ jobs: uses: actions/checkout@v3 with: repository: mosaicml/ci-testing - ref: v0.0.7 + ref: v0.0.8 path: ./ci-testing - uses: ./ci-testing/.github/actions/coverage with: diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml index 320c1a5fe6..6b67e857ec 100644 --- a/.github/workflows/daily.yaml +++ b/.github/workflows/daily.yaml @@ -14,7 +14,7 @@ concurrency: cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }} jobs: daily-pytest-cpu: - uses: mosaicml/ci-testing/.github/workflows/pytest-cpu.yaml@v0.0.7 + uses: mosaicml/ci-testing/.github/workflows/pytest-cpu.yaml@v0.0.8 strategy: matrix: include: @@ -100,7 +100,7 @@ jobs: download-path: artifacts daily-pytest-gpu: - uses: mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.7 + uses: mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.8 strategy: matrix: # Unlike CPU tests, we run daily tests together with GPU tests to minimize launch time diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index f32a589160..1bdb383823 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -9,7 +9,7 @@ concurrency: cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }} jobs: pytest-cpu: - uses: mosaicml/ci-testing/.github/workflows/pytest-cpu.yaml@v0.0.7 + uses: mosaicml/ci-testing/.github/workflows/pytest-cpu.yaml@v0.0.8 strategy: matrix: include: diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml index 3cb434ca58..f056292a43 100644 --- a/.github/workflows/pr-gpu.yaml +++ b/.github/workflows/pr-gpu.yaml @@ -9,7 +9,7 @@ concurrency: cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }} jobs: pytest-gpu-1: - uses: mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.7 + uses: mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.8 strategy: matrix: include: @@ -35,7 +35,7 @@ 
jobs: mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }} pytest-gpu-2: - uses: mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.7 + uses: mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.8 strategy: matrix: include: @@ -62,7 +62,7 @@ jobs: pytest-gpu-4: - uses: mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.7 + uses: mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.8 strategy: matrix: include: diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index faabebc7ac..0b253ea87f 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -24,7 +24,7 @@ jobs: uses: actions/checkout@v3 with: repository: mosaicml/ci-testing - ref: v0.0.7 + ref: v0.0.8 path: ./ci-testing - uses: ./ci-testing/.github/actions/code-quality with: diff --git a/.github/workflows/smoketest.yaml b/.github/workflows/smoketest.yaml index 08291b5c0e..e9c6316a8d 100644 --- a/.github/workflows/smoketest.yaml +++ b/.github/workflows/smoketest.yaml @@ -33,7 +33,7 @@ jobs: uses: actions/checkout@v3 with: repository: mosaicml/ci-testing - ref: v0.0.7 + ref: v0.0.8 path: ./ci-testing - uses: ./ci-testing/.github/actions/smoketest with: From 9c4b0ba2f899ed779017b7ed2d856348ceb43eb3 Mon Sep 17 00:00:00 2001 From: bigning Date: Fri, 7 Jun 2024 17:59:19 -0700 Subject: [PATCH 02/69] Fix backward compatibility caused by missing eval metrics class (#3385) * a * a' * a * a * a * a * a * a * a * Apply suggestions from code review Co-authored-by: Mihir Patel --------- Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> Co-authored-by: Mihir Patel --- composer/metrics/nlp.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/composer/metrics/nlp.py b/composer/metrics/nlp.py index 2f60f8d1c9..e6877292cf 100644 --- a/composer/metrics/nlp.py +++ b/composer/metrics/nlp.py @@ -178,3 +178,27 @@ def compute(self) -> Tensor: """Returns torch.exp() of the LanguageCrossEntropy.""" avg_loss = super().compute() return torch.exp(avg_loss) + + +# For backward compatibility +class InContextLearningMetric: + """InContextLearningMetric only exists for backwards compatibility of checkpoints that contain pickled metrics.""" + + def __init__(self): + raise RuntimeError( + f'This class only exists for maintaining backward compatibility for checkpoints that contain pickled metrics. 
Please instead use https://github.com/mosaicml/llm-foundry/blob/main/scripts/eval/README.md.', + ) + + def __getstate__(self): + return None + + def __setstate__(self, state): + pass + + +InContextLearningCodeEvalAccuracy = InContextLearningMetric +InContextLearningLMAccuracy = InContextLearningMetric +InContextLearningLMExpectedCalibrationError = InContextLearningMetric +InContextLearningMCExpectedCalibrationError = InContextLearningMetric +InContextLearningQAAccuracy = InContextLearningMetric +InContextLearningMultipleChoiceAccuracy = InContextLearningMetric From e85e7385544b6ef6de13beb50b76d21f731fa6f1 Mon Sep 17 00:00:00 2001 From: bigning Date: Fri, 7 Jun 2024 18:24:35 -0700 Subject: [PATCH 03/69] Bump version v0.23.2 (#3386) * a * bump --- composer/_version.py | 2 +- docker/README.md | 4 ++-- docker/build_matrix.yaml | 12 ++++++------ docker/generate_build_matrix.py | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/composer/_version.py b/composer/_version.py index a38b61a722..50d801763e 100644 --- a/composer/_version.py +++ b/composer/_version.py @@ -3,4 +3,4 @@ """The Composer Version.""" -__version__ = '0.24.0.dev0' +__version__ = '0.23.2' diff --git a/docker/README.md b/docker/README.md index 76128b6e92..05c97fe626 100644 --- a/docker/README.md +++ b/docker/README.md @@ -15,8 +15,8 @@ all dependencies for both NLP and Vision models. They are built on top of the | Composer Version | CUDA Support | Docker Tag | |--------------------|----------------|----------------------------------------------------------------| -| 0.23.1 | Yes | `mosaicml/composer:latest`, `mosaicml/composer:0.23.1` | -| 0.23.1 | No | `mosaicml/composer:latest_cpu`, `mosaicml/composer:0.23.1_cpu` | +| 0.23.2 | Yes | `mosaicml/composer:latest`, `mosaicml/composer:0.23.2` | +| 0.23.2 | No | `mosaicml/composer:latest_cpu`, `mosaicml/composer:0.23.2_cpu` | **Note**: For a lightweight installation, we recommended using a [MosaicML PyTorch Image](#pytorch-images) and manually diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml index 332d7deb5e..73074988b9 100644 --- a/docker/build_matrix.yaml +++ b/docker/build_matrix.yaml @@ -208,9 +208,9 @@ TORCHVISION_VERSION: 0.16.2 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04 - COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.1 + COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.2 CUDA_VERSION: 12.1.1 - IMAGE_NAME: composer-0-23-1 + IMAGE_NAME: composer-0-23-2 MOFED_VERSION: latest-23.10 NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 @@ -231,15 +231,15 @@ PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.3.1 TAGS: - - mosaicml/composer:0.23.1 + - mosaicml/composer:0.23.2 - mosaicml/composer:latest TARGET: composer_stage TORCHVISION_VERSION: 0.18.1 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: ubuntu:20.04 - COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.1 + COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.2 CUDA_VERSION: '' - IMAGE_NAME: composer-0-23-1-cpu + IMAGE_NAME: composer-0-23-2-cpu MOFED_VERSION: latest-23.10 NVIDIA_REQUIRE_CUDA_OVERRIDE: '' PYTHON_VERSION: '3.11' @@ -247,7 +247,7 @@ PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.3.1 TAGS: - - mosaicml/composer:0.23.1_cpu + - mosaicml/composer:0.23.2_cpu - mosaicml/composer:latest_cpu TARGET: composer_stage TORCHVISION_VERSION: 0.18.1 diff --git a/docker/generate_build_matrix.py 
b/docker/generate_build_matrix.py index f0398ed750..bf961a756c 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -231,7 +231,7 @@ def _main(): composer_entries = [] # The `GIT_COMMIT` is a placeholder and Jenkins will substitute it with the actual git commit for the `composer_staging` images - composer_versions = ['0.23.1'] # Only build images for the latest composer version + composer_versions = ['0.23.2'] # Only build images for the latest composer version composer_python_versions = [PRODUCTION_PYTHON_VERSION] # just build composer against the latest for product in itertools.product(composer_python_versions, composer_versions, cuda_options): From afa2e397b4a073fa1c99e3b5841013b9f8f74cb1 Mon Sep 17 00:00:00 2001 From: bigning Date: Fri, 7 Jun 2024 19:46:59 -0700 Subject: [PATCH 04/69] Restore dev version (#3388) * a * a --- composer/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/_version.py b/composer/_version.py index 50d801763e..a38b61a722 100644 --- a/composer/_version.py +++ b/composer/_version.py @@ -3,4 +3,4 @@ """The Composer Version.""" -__version__ = '0.23.2' +__version__ = '0.24.0.dev0' From 4cbb4a21aec7f8d4ed0b12417dadc7f335e383c7 Mon Sep 17 00:00:00 2001 From: Antoine Broyelle Date: Sun, 9 Jun 2024 20:36:53 +0100 Subject: [PATCH 05/69] Only requires `databricks-sdk` when inside the Databricks platform (#3389) --- composer/loggers/mlflow_logger.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/composer/loggers/mlflow_logger.py b/composer/loggers/mlflow_logger.py index f24c5f956f..92b3fc2657 100644 --- a/composer/loggers/mlflow_logger.py +++ b/composer/loggers/mlflow_logger.py @@ -88,7 +88,6 @@ def __init__( ) -> None: try: import mlflow - from databricks.sdk import WorkspaceClient from mlflow import MlflowClient except ImportError as e: raise MissingConditionalImportError( @@ -143,9 +142,19 @@ def __init__( DEFAULT_MLFLOW_EXPERIMENT_NAME, ) assert self.experiment_name is not None # type hint + if os.getenv('DATABRICKS_TOKEN') is not None and not self.experiment_name.startswith('/Users/'): + try: + from databricks.sdk import WorkspaceClient + except ImportError as e: + raise MissingConditionalImportError( + extra_deps_group='mlflow', + conda_package='databricks-sdk', + conda_channel='conda-forge', + ) from e databricks_username = WorkspaceClient().current_user.me().user_name or '' self.experiment_name = '/' + os.path.join('Users', databricks_username, self.experiment_name) + self._mlflow_client = MlflowClient(self.tracking_uri) # Set experiment env_exp_id = os.getenv( From 735aa6fa72a0d3799f74d6329e5d38167d35a54f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 9 Jun 2024 18:56:37 -0700 Subject: [PATCH 06/69] Update packaging requirement from <24.1,>=21.3.0 to >=21.3.0,<24.2 (#3392) Updates the requirements on [packaging](https://github.com/pypa/packaging) to permit the latest version. - [Release notes](https://github.com/pypa/packaging/releases) - [Changelog](https://github.com/pypa/packaging/blob/main/CHANGELOG.rst) - [Commits](https://github.com/pypa/packaging/compare/21.3...24.1) --- updated-dependencies: - dependency-name: packaging dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 0b40fe0c72..38aa4799b8 100644 --- a/setup.py +++ b/setup.py @@ -88,7 +88,7 @@ def package_files(prefix: str, directory: str, extension: str): 'coolname>=1.1.0,<3', 'tabulate==0.9.0', # for auto-generating tables 'py-cpuinfo>=8.0.0,<10', - 'packaging>=21.3.0,<24.1', + 'packaging>=21.3.0,<24.2', 'importlib-metadata>=5.0.0,<7', 'mosaicml-cli>=0.5.25,<0.7', ] From db1325a60f7839dd0e86f029e1ac49cc9a8a5dc4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 9 Jun 2024 19:27:14 -0700 Subject: [PATCH 07/69] Bump cryptography from 42.0.6 to 42.0.8 (#3391) Bumps [cryptography](https://github.com/pyca/cryptography) from 42.0.6 to 42.0.8. - [Changelog](https://github.com/pyca/cryptography/blob/main/CHANGELOG.rst) - [Commits](https://github.com/pyca/cryptography/compare/42.0.6...42.0.8) --- updated-dependencies: - dependency-name: cryptography dependency-type: direct:development update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 38aa4799b8..2eba4e39e6 100644 --- a/setup.py +++ b/setup.py @@ -139,7 +139,7 @@ def package_files(prefix: str, directory: str, extension: str): 'GitPython==3.1.43', 'moto[s3]>=4.0.1,<5', 'mock-ssh-server==0.9.1', - 'cryptography==42.0.6', + 'cryptography==42.0.8', 'pytest-httpserver>=1.0.4,<1.1', 'setuptools<=59.5.0', 'pillow==9.3.0', # Matches the Pillow version listed in the Dockerfile From 7778fcf0f80666cd22c789470cf3365ee8e8c041 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 11 Jun 2024 14:25:56 -0400 Subject: [PATCH 08/69] Skip extra dataset state load (#3393) * fix edge case * fix --- composer/core/state.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/composer/core/state.py b/composer/core/state.py index 083b977811..0864b50aaf 100644 --- a/composer/core/state.py +++ b/composer/core/state.py @@ -707,8 +707,10 @@ def train_dataloader(self, train_dataloader: Optional[Union[Iterable, DataLoader train_dataloader (Iterable | DataLoader, optional): The dataloader. """ self._train_dataloader = train_dataloader - # Load dataset state from checkpoint when train_dataloader is set - if self.dataset_state: + # Load dataset state from checkpoint when train_dataloader is set. This occurs if + # dataset_state was loaded from checkpoint and train_dataloader has not already + # consumed dataset_state['train'] to resume. + if self.dataset_state is not None and self.dataset_state.get('train') is not None: dataset = self._dataset_of(self._train_dataloader) if hasattr(dataset, 'load_state_dict'): dataset.load_state_dict(self.dataset_state['train']) # pyright: ignore @@ -1278,14 +1280,14 @@ def _load_dataset_state(self, obj: dict[str, Any]) -> None: Args: obj (dict[str, Any]): The state to load. 
""" - self.dataset_state = obj - dataset = self._dataset_of(self.train_dataloader) if hasattr(dataset, 'load_state_dict'): dataset.load_state_dict(obj['train']) # pyright: ignore obj['train'] = None self.dataset_resumption['train'] = True + self.dataset_state = obj + def load_model_state( self, state_dict: dict[str, Any], From 919fe91557ee17c77cf09e9d405a1c4396ab869d Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Wed, 12 Jun 2024 11:16:26 -0400 Subject: [PATCH 09/69] Remove FSDP restriction from PyTorch 1.13 (#3395) * remove torch 113 * lint --- composer/distributed/dist_strategy.py | 26 +------------------------- 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/composer/distributed/dist_strategy.py b/composer/distributed/dist_strategy.py index 1cc1044a02..be81652881 100644 --- a/composer/distributed/dist_strategy.py +++ b/composer/distributed/dist_strategy.py @@ -328,36 +328,12 @@ def sync_hook(*args): mixed_precision = fsdp_config.mixed_precision keep_low_precision_grads = fsdp_config.keep_low_precision_grads - mixed_precision, param_dtype, _, _ = get_mixed_precision( + mixed_precision, _, _, _ = get_mixed_precision( precision, mixed_precision=mixed_precision, keep_low_precision_grads=keep_low_precision_grads, ) - # Note: FSDP does support the use of torch.float32 with sharding. - # They just never expected a user to pass in torch.float32 into mixed_precision as a param_dtype. - # See: https://github.com/pytorch/pytorch/issues/90584 - # The PR fixing this bug is merged into PyTorch, but it hasn't made its way into a release yet. - # Instead a user needs to pass in `None` as param_dtype to have the parameters as torch.float32. - # TODO: remove these checks when PyTorch has a release that includes the fix. - if sharding_map_key != 'NO_SHARD': - if ( - precision == Precision.AMP_FP16 and param_dtype not in [torch.float16, None] or - precision == Precision.AMP_BF16 and param_dtype not in [torch.bfloat16, None] - ): - raise ValueError( - f'FSDP in PyTorch 1.13 does not support precision `{precision}` with sharding strategy `{sharding_strategy}` ' - f'and param_dtype `{param_dtype}.` Consider using one of the predefined mixed_precision strategies ' - "(choose: `'FULL'`, `'DEFAULT'`, `'PURE'`)", - ) - - if param_dtype == torch.float32: - raise ValueError( - f'FSDP in PyTorch 1.13 does not support param_dtype `{param_dtype}` with sharding_strategy `{sharding_map_key}` ' - f'Consider using `amp` or `bf16` for precision or setting param_dtype in mixed_precision to `None` ' - f'with sharding strategy `{sharding_map_key}.`', - ) - process_group = None if fsdp_config.process_group is not None: process_group_dict = {'process_group': fsdp_config.process_group} From b07b82e5f815787fcb55d8643236ba456439090d Mon Sep 17 00:00:00 2001 From: Joe Early Date: Thu, 13 Jun 2024 18:29:33 +0100 Subject: [PATCH 10/69] Check for 'CUDA error: out of memory' with auto-microbatching (#3400) --- composer/trainer/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index c680d1d3d7..ba455cd78d 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -307,7 +307,7 @@ def _get_initial_device_train_microbatch_size( def _is_cuda_oom(e: RuntimeError): """Determines if error is CUDA Out of Memory and if auto_microbatching is enabled.""" - if 'CUDA out of memory' in str(e): + if any(s in str(e) for s in ['CUDA out of memory', 'CUDA error: out of memory']): return True # With batch_norm, large batch sizes 
sometimes result in cuDNN instead of Cuda OOMs. if 'cuDNN error: CUDNN_STATUS_NOT_SUPPORTED. This error may appear if you passed in a non-contiguous input.' in str( From 6298d76f216b533d60c2fd11db9a0f93851aebfc Mon Sep 17 00:00:00 2001 From: Brian <23239305+b-chu@users.noreply.github.com> Date: Thu, 13 Jun 2024 16:24:14 -0400 Subject: [PATCH 11/69] Add tokens to iterations (#3374) --- composer/core/callback.py | 28 +++++++++++++------------ composer/core/state.py | 2 +- composer/core/time.py | 32 +++++++++++++++++++++++++++++ composer/trainer/trainer.py | 32 ++++++++++++++++++++++------- tests/checkpoint/test_state_dict.py | 1 + tests/test_time.py | 10 +++++++-- 6 files changed, 82 insertions(+), 23 deletions(-) diff --git a/composer/core/callback.py b/composer/core/callback.py index fef48ca1b1..897cf5f733 100644 --- a/composer/core/callback.py +++ b/composer/core/callback.py @@ -273,19 +273,21 @@ def batch_end(self, state: State, logger: Logger) -> None: The following :attr:`.State.timestamp` member variables are incremented immediately before the :attr:`.Event.BATCH_END` event. - +------------------------------------+ - | :attr:`.Timestamp.batch` | - +------------------------------------+ - | :attr:`.Timestamp.batch_in_epoch` | - +------------------------------------+ - | :attr:`.Timestamp.sample` | - +------------------------------------+ - | :attr:`.Timestamp.sample_in_epoch` | - +------------------------------------+ - | :attr:`.Timestamp.token` | - +------------------------------------+ - | :attr:`.Timestamp.token_in_epoch` | - +------------------------------------+ + +--------------------------------------+ + | :attr:`.Timestamp.batch` | + +--------------------------------------+ + | :attr:`.Timestamp.batch_in_epoch` | + +--------------------------------------+ + | :attr:`.Timestamp.sample` | + +--------------------------------------+ + | :attr:`.Timestamp.sample_in_epoch` | + +--------------------------------------+ + | :attr:`.Timestamp.token` | + +--------------------------------------+ + | :attr:`.Timestamp.token_in_epoch` | + +--------------------------------------+ + | :attr:`.Timestamp.token_in_iteration`| + +--------------------------------------+ Args: state (State): The training state. diff --git a/composer/core/state.py b/composer/core/state.py index 0864b50aaf..fa4feaec75 100644 --- a/composer/core/state.py +++ b/composer/core/state.py @@ -766,7 +766,7 @@ def _iteration_length(self, iteration_length: Optional[Union[str, Time[int]]]): return if isinstance(iteration_length, str): iteration_length = ensure_time(iteration_length, TimeUnit.EPOCH) - if iteration_length.unit != TimeUnit.EPOCH: + if iteration_length.unit != TimeUnit.EPOCH and iteration_length.unit != TimeUnit.TOKEN: raise NotImplementedError(f'{iteration_length.unit} is not allowed as a unit for iteration_length.') self.__iteration_length = iteration_length diff --git a/composer/core/time.py b/composer/core/time.py index c21f377026..3916dd7659 100644 --- a/composer/core/time.py +++ b/composer/core/time.py @@ -473,6 +473,7 @@ class Timestamp(Serializable): sample (int | Time[int], optional): The sample. token (int | Time[int], optional): The token. epoch_in_iteration (int | Time[int], optional): The epoch in the iteration. + token_in_iteration (int | Time[int], optional): The token in the iteration. batch_in_epoch (int | Time[int], optional): The batch in the epoch. sample_in_epoch (int | Time[int], optional): The sample in the epoch. token_in_epoch (int | Time[int], optional): The token in the epoch. 
@@ -490,6 +491,7 @@ def __init__( sample: Union[int, Time[int]] = 0, token: Union[int, Time[int]] = 0, epoch_in_iteration: Union[int, Time[int]] = 0, + token_in_iteration: Union[int, Time[int]] = 0, batch_in_epoch: Union[int, Time[int]] = 0, sample_in_epoch: Union[int, Time[int]] = 0, token_in_epoch: Union[int, Time[int]] = 0, @@ -531,6 +533,14 @@ def __init__( )) self._epoch_in_iteration = epoch_in_iteration + token_in_iteration = Time.from_input(token_in_iteration, TimeUnit.TOKEN) + if token_in_iteration.unit != TimeUnit.TOKEN: + raise ValueError(( + f'The `token_in_iteration` argument has units of {token_in_iteration.unit}; ' + f'not {TimeUnit.TOKEN}.' + )) + self._token_in_iteration = token_in_iteration + batch_in_epoch = Time.from_input(batch_in_epoch, TimeUnit.BATCH) if batch_in_epoch.unit != TimeUnit.BATCH: raise ValueError( @@ -579,6 +589,7 @@ def state_dict(self) -> dict[str, Any]: 'sample': self.sample.value, 'token': self.token.value, 'epoch_in_iteration': self.epoch_in_iteration.value, + 'token_in_iteration': self.token_in_iteration.value, 'batch_in_epoch': self.batch_in_epoch.value, 'sample_in_epoch': self.sample_in_epoch.value, 'token_in_epoch': self.token_in_epoch.value, @@ -609,6 +620,8 @@ def load_state_dict(self, state: dict[str, Any]) -> None: self._iteration = Time(state['iteration'], TimeUnit.ITERATION) if 'epoch_in_iteration' in state: self._epoch_in_iteration = Time(state['epoch_in_iteration'], TimeUnit.EPOCH) + if 'token_in_iteration' in state: + self._token_in_iteration = Time(state['token_in_iteration'], TimeUnit.TOKEN) if 'iteration_wct' in state: self._iteration_wct = state['iteration_wct'] @@ -642,6 +655,11 @@ def epoch_in_iteration(self) -> Time[int]: """The epoch count in the current iteration (resets at 0 at the beginning of every iteration).""" return self._epoch_in_iteration + @property + def token_in_iteration(self) -> Time[int]: + """The token count in the current iteration (resets at 0 at the beginning of every iteration).""" + return self._token_in_iteration + @property def batch_in_epoch(self) -> Time[int]: """The batch count in the current epoch (resets at 0 at the beginning of every epoch).""" @@ -814,6 +832,7 @@ def to_next_batch( sample_in_epoch=self.sample_in_epoch + samples, token=self.token + tokens, token_in_epoch=self.token_in_epoch + tokens, + token_in_iteration=self.token_in_iteration + tokens, total_wct=self.total_wct + duration, iteration_wct=self.iteration_wct + duration, epoch_wct=self.epoch_wct + duration, @@ -822,6 +841,7 @@ def to_next_batch( def to_next_epoch( self, + tokens: Union[int, Time] = 0, duration: Optional[datetime.timedelta] = None, ): """Create a new :class:`.Timestamp`, advanced to the next epoch. @@ -841,6 +861,7 @@ def to_next_epoch( >>> timestamp.copy( ... epoch=timestamp.epoch + 1, ... epoch_in_iteration=timestamp.epoch_in_iteration + 1, + ... token_in_iteration=timestamp.token_in_iteration + tokens, ... batch_in_epoch=0, ... sample_in_epoch=0, ... token_in_epoch=0, @@ -851,12 +872,17 @@ def to_next_epoch( ... ) Timestamp(...) + Args: + tokens (int | Time, optional): The number of tokens trained in the batch. Defaults to 0. + duration (datetime.timedelta, optional): The duration to train the batch. 
+ """ if duration is None: duration = datetime.timedelta(seconds=0) return self.copy( epoch=self.epoch + 1, epoch_in_iteration=self.epoch_in_iteration + 1, + token_in_iteration=self.token_in_iteration + tokens, batch_in_epoch=0, sample_in_epoch=0, token_in_epoch=0, @@ -886,6 +912,7 @@ def to_next_iteration( >>> timestamp.copy( ... iteration=timestamp.iteration + 1, ... epoch_in_iteration=0, + ... token_in_iteration=0, ... batch_in_epoch=0, ... sample_in_epoch=0, ... token_in_epoch=0, @@ -902,6 +929,7 @@ def to_next_iteration( return self.copy( iteration=self.iteration + 1, epoch_in_iteration=0, + token_in_iteration=0, batch_in_epoch=0, sample_in_epoch=0, token_in_epoch=0, @@ -919,6 +947,7 @@ def copy( sample: Optional[Union[int, Time[int]]] = None, token: Optional[Union[int, Time[int]]] = None, epoch_in_iteration: Optional[Union[int, Time[int]]] = None, + token_in_iteration: Optional[Union[int, Time[int]]] = None, batch_in_epoch: Optional[Union[int, Time[int]]] = None, sample_in_epoch: Optional[Union[int, Time[int]]] = None, token_in_epoch: Optional[Union[int, Time[int]]] = None, @@ -938,6 +967,7 @@ def copy( sample (int | Time[int], optional): The sample. token (int | Time[int], optional): The token. epoch_in_iteration (int | Time[int], optional): The epoch in the iteration. + token_in_iteration (int | Time[int], optional): The token in the iteration. batch_in_epoch (int | Time[int], optional): The batch in the epoch. sample_in_epoch (int | Time[int], optional): The sample in the epoch. token_in_epoch (int | Time[int], optional): The token in the epoch. @@ -957,6 +987,7 @@ def copy( sample=sample if sample is not None else self.sample, token=token if token is not None else self.token, epoch_in_iteration=epoch_in_iteration if epoch_in_iteration is not None else self.epoch_in_iteration, + token_in_iteration=token_in_iteration if token_in_iteration is not None else self.token_in_iteration, batch_in_epoch=batch_in_epoch if batch_in_epoch is not None else self.batch_in_epoch, sample_in_epoch=sample_in_epoch if sample_in_epoch is not None else self.sample_in_epoch, token_in_epoch=token_in_epoch if token_in_epoch is not None else self.token_in_epoch, @@ -975,6 +1006,7 @@ def __repr__(self) -> str: f'sample={int(self.sample)}, ' f'token={int(self.token)}, ' f'epoch_in_iteration={int(self.epoch_in_iteration)}, ' + f'token_in_iteration={int(self.token_in_iteration)}, ' f'batch_in_epoch={int(self.batch_in_epoch)}, ' f'sample_in_epoch={int(self.sample_in_epoch)}, ' f'token_in_epoch={int(self.token_in_epoch)}, ' diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index ba455cd78d..4447698beb 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -2610,10 +2610,24 @@ def _train_loop(self) -> None: self.engine.run_event(Event.BATCH_CHECKPOINT) - if self.state.timestamp >= self.state.max_duration: + if ( + self.state.timestamp >= self.state.max_duration or ( + self.state._iteration_length is not None and + self.state.timestamp.token_in_iteration.unit == self.state._iteration_length.unit and + self.state.timestamp.token_in_iteration >= self.state._iteration_length + ) + ): # If max_duration is specified in batches, samples, or tokens, and # and the max_duration is reached mid-epoch, then break out of the dataloader # to finish the epoch early and finish training. 
+ + # Increment iteration + if ( + self.state._iteration_length is not None and + self.state.timestamp.token_in_iteration.unit == self.state._iteration_length.unit and + self.state.timestamp.token_in_iteration >= self.state._iteration_length + ): + self._increment_iteration() finished_epoch_early = True break @@ -2649,12 +2663,10 @@ def _train_loop(self) -> None: # Increment iteration if ( self.state._iteration_length is not None and - self.state.timestamp.epoch_in_iteration == self.state._iteration_length + self.state.timestamp.epoch_in_iteration.unit == self.state._iteration_length.unit and + self.state.timestamp.epoch_in_iteration >= self.state._iteration_length ): - self.state.previous_timestamp = self.state.timestamp - self.state.timestamp = self.state.timestamp.to_next_iteration() - self.engine.run_event(Event.ITERATION_END) - self.engine.run_event(Event.ITERATION_CHECKPOINT) + self._increment_iteration() # Log final time values self.logger.log_metrics({ @@ -3039,6 +3051,12 @@ def _train_microbatch( return microbatch_loss_dict + def _increment_iteration(self): + self.state.previous_timestamp = self.state.timestamp + self.state.timestamp = self.state.timestamp.to_next_iteration() + self.engine.run_event(Event.ITERATION_END) + self.engine.run_event(Event.ITERATION_CHECKPOINT) + def predict( self, dataloader: Union[DataLoader, DataSpec], @@ -3506,7 +3524,7 @@ def _eval_loop( outputs.append(v) else: outputs = self.state.outputs.cpu() - batch = DeviceCPU().batch_to_device(self.state.batch,) + batch = DeviceCPU().batch_to_device(self.state.batch) else: outputs = self.state.outputs batch = self.state.batch diff --git a/tests/checkpoint/test_state_dict.py b/tests/checkpoint/test_state_dict.py index ee53a36ff9..af0ca34961 100644 --- a/tests/checkpoint/test_state_dict.py +++ b/tests/checkpoint/test_state_dict.py @@ -568,6 +568,7 @@ def test_get_resumption_state_dict(): 'sample': 0, 'token': 0, 'epoch_in_iteration': 0, + 'token_in_iteration': 0, 'batch_in_epoch': 0, 'sample_in_epoch': 0, 'token_in_epoch': 0, diff --git a/tests/test_time.py b/tests/test_time.py index b5fad369d9..1545eaa3b1 100644 --- a/tests/test_time.py +++ b/tests/test_time.py @@ -151,7 +151,7 @@ def test_timestamp_to_next_batch_epoch_iteration(): # Step batch 0 in epoch 0 timestamp = timestamp.to_next_batch(10, 20, datetime.timedelta(seconds=5)) assert timestamp.batch == 1 - assert timestamp.batch_in_epoch == 1 + assert timestamp.token_in_iteration == 20 assert timestamp.batch_in_epoch == 1 assert timestamp.sample == 10 assert timestamp.sample_in_epoch == 10 @@ -163,9 +163,10 @@ def test_timestamp_to_next_batch_epoch_iteration(): assert timestamp.batch_wct == datetime.timedelta(seconds=5) # Finish epoch 0 - timestamp = timestamp.to_next_epoch(datetime.timedelta(seconds=5)) + timestamp = timestamp.to_next_epoch(duration=datetime.timedelta(seconds=5)) assert timestamp.epoch == 1 assert timestamp.batch == 1 + assert timestamp.token_in_iteration == 20 assert timestamp.batch_in_epoch == 0 assert timestamp.sample == 10 assert timestamp.sample_in_epoch == 0 @@ -181,6 +182,7 @@ def test_timestamp_to_next_batch_epoch_iteration(): assert timestamp.epoch == 1 assert timestamp.batch == 2 assert timestamp.epoch_in_iteration == 1 + assert timestamp.token_in_iteration == 20 assert timestamp.batch_in_epoch == 1 assert timestamp.sample == 15 assert timestamp.sample_in_epoch == 5 @@ -195,6 +197,7 @@ def test_timestamp_to_next_batch_epoch_iteration(): timestamp = timestamp.to_next_batch(5, 1, datetime.timedelta(seconds=10)) assert 
timestamp.epoch == 1 assert timestamp.batch == 3 + assert timestamp.token_in_iteration == 21 assert timestamp.batch_in_epoch == 2 assert timestamp.sample == 20 assert timestamp.sample_in_epoch == 10 @@ -210,6 +213,7 @@ def test_timestamp_to_next_batch_epoch_iteration(): assert timestamp.epoch == 2 assert timestamp.batch == 3 assert timestamp.epoch_in_iteration == 2 + assert timestamp.token_in_iteration == 21 assert timestamp.batch_in_epoch == 0 assert timestamp.sample == 20 assert timestamp.sample_in_epoch == 0 @@ -224,6 +228,7 @@ def test_timestamp_to_next_batch_epoch_iteration(): assert timestamp.epoch == 2 assert timestamp.batch == 4 assert timestamp.epoch_in_iteration == 2 + assert timestamp.token_in_iteration == 22 assert timestamp.batch_in_epoch == 1 assert timestamp.sample == 25 assert timestamp.sample_in_epoch == 5 @@ -240,6 +245,7 @@ def test_timestamp_to_next_batch_epoch_iteration(): assert timestamp.epoch == 2 assert timestamp.batch == 4 assert timestamp.epoch_in_iteration == 0 + assert timestamp.token_in_iteration == 0 assert timestamp.batch_in_epoch == 0 assert timestamp.sample == 25 assert timestamp.sample_in_epoch == 0 From 9500fd17d809d364af85911aefa92d621451399c Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Thu, 13 Jun 2024 17:56:05 -0700 Subject: [PATCH 12/69] Busy wait utils in dist (#3396) --- composer/utils/dist.py | 73 ++++++++++++++++++++++++++++++++++++++++ tests/utils/test_dist.py | 47 ++++++++++++++++++++++++++ 2 files changed, 120 insertions(+) diff --git a/composer/utils/dist.py b/composer/utils/dist.py index 573e940bb9..95a95835f4 100644 --- a/composer/utils/dist.py +++ b/composer/utils/dist.py @@ -37,6 +37,8 @@ import logging import os import pickle +import random +import string import sys import time from contextlib import contextmanager @@ -627,6 +629,77 @@ def get_sampler( ) +def get_node_signal_file_name(rng: Optional[random.Random] = None) -> str: + """Returns a file name to use for a file based wait within a node. + + The file name will contain a randomly generated string to avoid conflicts. + Note: This file name will be the same on each node, so that it can be used for a file based wait. + + Returns: + str: The name of the file that will be created to signal the end of a node's training. + """ + if rng is None: + rng = random.Random() + + random_string = ''.join(rng.choices(string.ascii_letters + string.digits, k=6)) + node_rank = get_node_rank() + file_name_list = [f'._signal_file_node{node_rank}_{random_string}'] + dist.broadcast_object_list(file_name_list, src=0) + return file_name_list[0] + + +def write_signal_file(signal_file_name: str, dir_path: Optional[str] = None) -> str: + """Writes a signal file to the specified directory. + + This function creates a signal file in the specified directory. If the directory does + Note: Only local rank zero writes the signal file. All other ranks are expected to wait for the signal file. + + Args: + signal_file_name (str): The name of the signal file. + dir_path (str, optional): The full path to the directory in which to create the signal file. If ``None``, + the current working directory will be used. 
+ """ + if dir_path is not None: + os.makedirs(dir_path, exist_ok=True) + + signal_file_path = os.path.join(dir_path or os.getcwd(), signal_file_name) + if get_local_rank() == 0: + with open(signal_file_path, 'w') as _f: + _f.write('local rank zero done') + + return signal_file_path + + +@contextmanager +def busy_wait_for_local_rank_zero(dir_path: Optional[str] = None): + """Busy waits for the signal file to be created by local rank zero. + + This function will wait for the signal file to be created by local rank zero. It will + check every 0.1 seconds for the existence of the file. + + Args: + dir_path (str, optional): The directory in which to look for the signal file. If ``None``, + the current working directory will be used. + """ + # Get unique file name + signal_file_name = get_node_signal_file_name() + + # All ranks yield execution to allow local rank zero to run the code it needs to + yield + + # Local rank zero writes the signal file, all other rank just get the expected path + signal_file_path = write_signal_file(signal_file_name=signal_file_name, dir_path=dir_path) + + # Wait for the signal file to be created by local rank zero + with local_rank_zero_download_and_wait(signal_file_path): + # Sync all ranks across nodes as busy wait only is within node + dist.barrier() + + # Remove the signal file + if get_local_rank() == 0: + os.remove(signal_file_path) + + @contextmanager def local_rank_zero_download_and_wait(expected_file_path: str): """Context manager to wait for a file to exist on all ranks except local rank zero. diff --git a/tests/utils/test_dist.py b/tests/utils/test_dist.py index 44aedecf3d..608e56e5d2 100644 --- a/tests/utils/test_dist.py +++ b/tests/utils/test_dist.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 +import os +import time from unittest.mock import patch import pytest @@ -27,3 +29,48 @@ def test_run_local_rank_first_context_runs_properly(): # so dist is initialized here and this code should run without error with dist.run_local_rank_zero_first(): pass + + +@pytest.mark.world_size(2) +def test_get_node_signal_file_name(): + file_name = dist.get_node_signal_file_name() + gathered_file_names = dist.all_gather_object(file_name) + + assert len(gathered_file_names) == 2 + assert gathered_file_names[0] == gathered_file_names[1] + assert gathered_file_names[0] == file_name + assert file_name.startswith('._signal_file_node0_') + assert len(file_name) == len('._signal_file_node0_') + 6 + + +@pytest.mark.world_size(2) +def test_write_signal_file(tmp_path): + file_name = dist.get_node_signal_file_name() + file_path = os.path.join(tmp_path, file_name) + dist.write_signal_file(file_name, tmp_path) + + # tmp_path will be different on each rank, and only rank zero + # should have written a file + if dist.get_local_rank() == 0: + assert os.path.exists(file_path) + else: + assert not os.path.exists(file_path) + + +@pytest.mark.world_size(2) +def test_busy_wait_for_local_rank_zero(tmp_path): + gathered_tmp_path = dist.all_gather_object(tmp_path)[0] + + dist.barrier() + start_time = time.time() + assert os.listdir(gathered_tmp_path) == [] + with dist.busy_wait_for_local_rank_zero(gathered_tmp_path): + if dist.get_local_rank() == 0: + time.sleep(0.5) + + end_time = time.time() + total_time = end_time - start_time + gathered_times = dist.all_gather_object(total_time) + assert os.listdir(gathered_tmp_path) == [] + assert len(gathered_times) == 2 + assert abs(gathered_times[0] - gathered_times[1]) < 0.1 From 
a1c581d86157a1b3d20886ca1e1c3433ed922adc Mon Sep 17 00:00:00 2001 From: Chen Qian Date: Fri, 14 Jun 2024 13:17:45 -0700 Subject: [PATCH 13/69] Add buffering time to mlflow logger (#3401) * Add buffering time to mlflow logger * rename * change default and fix comments --- composer/loggers/mlflow_logger.py | 7 ++++++ tests/loggers/test_mlflow_logger.py | 37 +++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/composer/loggers/mlflow_logger.py b/composer/loggers/mlflow_logger.py index 92b3fc2657..9a64ef5d9d 100644 --- a/composer/loggers/mlflow_logger.py +++ b/composer/loggers/mlflow_logger.py @@ -66,6 +66,9 @@ class MLFlowLogger(LoggerDestination): resume (bool, optional): If ``True``, Composer will search for an existing run tagged with the `run_name` and resume it. If no existing run is found, a new run will be created. If ``False``, Composer will create a new run. (default: ``False``) + logging_buffer_seconds (int, optional): The amount of time, in seconds, that MLflow + waits before sending logs to the MLflow tracking server. Metrics/params/tags logged + within this buffer time will be grouped in batches before being sent to the backend. """ def __init__( @@ -85,6 +88,7 @@ def __init__( ignore_hyperparameters: Optional[list[str]] = None, run_group: Optional[str] = None, resume: bool = False, + logging_buffer_seconds: Optional[int] = 10, ) -> None: try: import mlflow @@ -116,6 +120,9 @@ def __init__( ) self.resume = resume + if logging_buffer_seconds: + os.environ['MLFLOW_ASYNC_LOGGING_BUFFERING_SECONDS'] = str(logging_buffer_seconds) + self._rank_zero_only = rank_zero_only self._last_flush_time = time.time() self._flush_interval = flush_interval diff --git a/tests/loggers/test_mlflow_logger.py b/tests/loggers/test_mlflow_logger.py index 61d52d8023..4fe221f52d 100644 --- a/tests/loggers/test_mlflow_logger.py +++ b/tests/loggers/test_mlflow_logger.py @@ -798,6 +798,43 @@ def test_rename_metrics(self, device, num_batches, tmp_path): assert not os.path.exists(metric_file) +def test_mlflow_logging_time_buffer(tmp_path): + mlflow = pytest.importorskip('mlflow') + if not hasattr(mlflow.environment_variables, 'MLFLOW_ASYNC_LOGGING_BUFFERING_SECONDS'): + pytest.skip('MLFlow {mlflow.__version__} does not support async logging buffer seconds.') + + with patch('mlflow.store.tracking.file_store.FileStore.log_batch') as mock_log_batch: + + mlflow_uri = tmp_path / Path('my-test-mlflow-uri') + experiment_name = 'mlflow_logging_test' + mock_state = MagicMock() + mock_logger = MagicMock() + + test_mlflow_logger = MLFlowLogger( + tracking_uri=mlflow_uri, + experiment_name=experiment_name, + log_system_metrics=True, + run_name='test_run', + logging_buffer_seconds=2, + ) + test_mlflow_logger.init(state=mock_state, logger=mock_logger) + test_mlflow_logger.log_hyperparameters({'name': 'test'}) + steps = 10 + for i in range(steps): + metrics = { + 'foo': i, + 'bar': i, + } + test_mlflow_logger.log_metrics(metrics, step=i) + test_mlflow_logger.post_close() + + # There will be 2 calls to `log_batch`, one from `start_run` with tags, and one from the metrics + # and hyperparameters logging. 
+ assert mock_log_batch.call_count == 2 + assert len(mock_log_batch.call_args_list[0][1]['metrics']) == 0 + assert len(mock_log_batch.call_args_list[1][1]['metrics']) == 2 * steps + + def test_mlflow_resume_run(tmp_path): mlflow = pytest.importorskip('mlflow') From fffa33571e5d5c8bc9b3e8ecdf97f75d74ce39c8 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Fri, 14 Jun 2024 13:36:53 -0700 Subject: [PATCH 14/69] Update _patch_pytorch.py (#3402) --- composer/trainer/_patch_pytorch.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/composer/trainer/_patch_pytorch.py b/composer/trainer/_patch_pytorch.py index 5e59849d45..6771c5db4b 100644 --- a/composer/trainer/_patch_pytorch.py +++ b/composer/trainer/_patch_pytorch.py @@ -933,6 +933,8 @@ def device_mesh__getitem__(self, mesh_dim_names: Union[str, tuple[str]]) -> 'Dev return submesh else: + from torch.distributed.device_mesh import _mesh_resources + def create_child_mesh( self, parent_mesh: 'DeviceMesh', submesh_dim_names: Tuple[str, ...], ) -> 'DeviceMesh': From 3e1396eb18b33ccb387408bf436c672f50466b69 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Mon, 17 Jun 2024 08:52:48 -0700 Subject: [PATCH 15/69] Add pynvml to mlflow dep group (#3404) --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 2eba4e39e6..5beb4d5136 100644 --- a/setup.py +++ b/setup.py @@ -225,6 +225,7 @@ def package_files(prefix: str, directory: str, extension: str): extra_deps['mlflow'] = [ 'mlflow>=2.11.1,<3.0', 'databricks-sdk==0.28.0', + 'pynvml>=11.5.0,<12', ] extra_deps['pandas'] = ['pandas>=2.0.0,<3.0'] From 0eb1eee6c0dc94426cd423903e9cb0d4dc89a5ad Mon Sep 17 00:00:00 2001 From: Jack Zhang <170473087+JackZ-db@users.noreply.github.com> Date: Mon, 17 Jun 2024 09:10:43 -0700 Subject: [PATCH 16/69] min/max flagging added to system_metrics_monitor with only non-redundant, necessary gpu metrics logged (#3373) * implemented min_max flag * fixed string parsing * refactoring compute_system_metrics for all_reduce * keep track of rank within dict * added compute_min_max * added flag for both min_max and all_logging * corrected min_max call with model_device * removing total bytes (always going ot be constant) * handled no gpu case in min_max flag * removed unnecessary imports, patched unit tests * fixed assert statement for with gpu case, world size 1 * case min_rank and max_rank as int to guarantee them working as indices * fixed indent issue from fixing font * made docs more concise and readable * fixing unexpected unindent * fixing unit test device * modifying device to equal model_device.type * reverting to device=model_device * setting device in unit test = 'gpu' * setting device = 'cuda' in unit testing * reverting to next(state.model.parameters()).device * removed torch as a dependecy for unit_testing * cleaned up UI to be consistent + removed calling next to obtain device --------- Co-authored-by: Mihir Patel Co-authored-by: Charles Tang --- composer/callbacks/system_metrics_monitor.py | 93 ++++++++++++++++--- .../callbacks/test_system_metrics_monitor.py | 4 +- 2 files changed, 81 insertions(+), 16 deletions(-) diff --git a/composer/callbacks/system_metrics_monitor.py b/composer/callbacks/system_metrics_monitor.py index 292e31e57b..bdd2cebce2 100644 --- a/composer/callbacks/system_metrics_monitor.py +++ b/composer/callbacks/system_metrics_monitor.py @@ -9,6 +9,7 @@ import os import psutil +import torch from composer.core import Callback, Event, State from composer.loggers import Logger @@ 
-19,13 +20,52 @@ __all__ = ['SystemMetricsMonitor'] +_GPU_METRICS = [ + 'gpu_percentage', + 'memory_percentage', + 'gpu_temperature_C', + 'gpu_power_usage_W', +] + class SystemMetricsMonitor(Callback): - """Track system metrics.""" + """Logs GPU/CPU metrics. + + GPU Metrics: + gpu_percentage: Occupancy rate, percent of time over sampling period during which one or more kernels was executing on the GPU. + memory_percentage: Percent of time over sampling period during which global memory was being read or written. + gpu_temperature_C: Temperature of device, in Celcius. + gpu_power_usage_W: Power usage of device, in Watts. + + By default, only the maximum and minimum values for these metrics, alongside their respective ranks in the key names, + are logged on the :attr:`.Event.BATCH_START`, :attr:`.Event.EVAL_BATCH_START`, :attr:`.Event.PREDICT_BATCH_START` + events for every batch. If log_all_data is set to True, all values for these metrics across all ranks are logged on the + above events for every batch. + + Example: + .. doctest:: - def __init__(self, gpu_available: bool = False) -> None: + >>> from composer import Trainer + >>> from composer.callbacks import SystemMetricsMonitor + >>> # constructing trainer object with this callback + >>> trainer = Trainer( + ... model=model, + ... train_dataloader=train_dataloader, + ... eval_dataloader=eval_dataloader, + ... optimizers=optimizer, + ... max_duration='1ep', + ... callbacks=[SystemMetricsMonitor()], + ... ) + + Args: + log_all_data (bool, optional): True if user wants to log data for all ranks, not just the min/max. + Defaults to False. + """ + + def __init__(self, log_all_data: bool = False) -> None: super().__init__() - self.gpu_available = gpu_available + self.gpu_available = torch.cuda.is_available() + self.log_all_data = log_all_data if self.gpu_available: try: import pynvml @@ -46,9 +86,23 @@ def run_event(self, event: Event, state: State, logger: Logger): ]: local_node_system_metrics = self.compute_system_metrics() all_system_metrics = dist.all_gather_object(local_node_system_metrics) - system_metrics = { - key: value for local_metrics in all_system_metrics for key, value in local_metrics.items() - } + system_metrics = {} + + if self.log_all_data: + for rank, metrics in enumerate(all_system_metrics): + for key, value in metrics.items(): + if key in _GPU_METRICS: + system_metrics[f'{key}_rank_{rank}'] = value + else: + system_metrics[key] = value + + else: + system_metrics = self.compute_gpu_min_max_metrics(all_system_metrics, state) + for rank, metrics in enumerate(all_system_metrics): + for key, value in metrics.items(): + if key not in _GPU_METRICS: + system_metrics[key] = value + logger.log_metrics(system_metrics) def compute_system_metrics(self): @@ -58,17 +112,14 @@ def compute_system_metrics(self): if self.gpu_available: import pynvml local_rank = dist.get_local_rank() - global_rank = dist.get_global_rank() handle = pynvml.nvmlDeviceGetHandleByIndex(local_rank) - memory = pynvml.nvmlDeviceGetMemoryInfo(handle) - system_metrics[f'device{global_rank}_memory_total'] = memory.total - system_metrics[f'device{global_rank}_memory_free'] = memory.free - system_metrics[f'device{global_rank}_memory_used'] = memory.used device_utilization = pynvml.nvmlDeviceGetUtilizationRates(handle) - system_metrics[f'device{global_rank}_gpu_percentage'] = device_utilization.gpu - system_metrics[f'device{global_rank}_memory_percentage'] = device_utilization.memory + system_metrics['gpu_percentage'] = device_utilization.gpu + 
system_metrics['memory_percentage'] = device_utilization.memory temperature = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU) - system_metrics[f'device{global_rank}_gpu_temperature'] = temperature + system_metrics['gpu_temperature_C'] = temperature + power = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0 # convert from mW to W + system_metrics['gpu_power_usage_W'] = power # Get metrics for the system cpu_percent = psutil.cpu_percent() @@ -83,3 +134,17 @@ def compute_system_metrics(self): for k, v in network_usage.items(): system_metrics[f'network_{k}'] = v return system_metrics + + def compute_gpu_min_max_metrics(self, all_metrics, state): + min_max_metrics = {} + + if self.gpu_available: + for key in _GPU_METRICS: + values = torch.tensor([metrics_for_cur_rank[key] for metrics_for_cur_rank in all_metrics]) + values = state.device.tensor_to_device(values) + min_rank = int(torch.argmin(values).item()) + max_rank = int(torch.argmax(values).item()) + min_max_metrics[f'min_{key}_rank_{min_rank}'] = values[min_rank].item() + min_max_metrics[f'max_{key}_rank_{max_rank}'] = values[max_rank].item() + + return min_max_metrics diff --git a/tests/callbacks/test_system_metrics_monitor.py b/tests/callbacks/test_system_metrics_monitor.py index a26d02ba93..c974f6cbed 100644 --- a/tests/callbacks/test_system_metrics_monitor.py +++ b/tests/callbacks/test_system_metrics_monitor.py @@ -13,7 +13,7 @@ @pytest.mark.gpu def test_system_metrics_monitor_gpu(): # Construct the trainer - system_metrics_monitor = SystemMetricsMonitor(gpu_available=True) + system_metrics_monitor = SystemMetricsMonitor() in_memory_logger = InMemoryLogger() trainer = Trainer( model=SimpleModel(), @@ -24,7 +24,7 @@ def test_system_metrics_monitor_gpu(): ) trainer.fit() - assert 'device0_gpu_percentage' in in_memory_logger.data + assert 'min_gpu_percentage_rank_0' in in_memory_logger.data assert 'cpu_percentage' in in_memory_logger.data From 0ee83f78db4d645819fed51b91a69aee6c8fc2df Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Mon, 17 Jun 2024 10:57:49 -0700 Subject: [PATCH 17/69] simplify launcher (#3398) --- composer/cli/launcher.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/composer/cli/launcher.py b/composer/cli/launcher.py index 08dd7b3921..91110c2add 100755 --- a/composer/cli/launcher.py +++ b/composer/cli/launcher.py @@ -197,8 +197,13 @@ def _parse_args(): if args.nproc < 1: raise ValueError('The nproc must be 1 or greater') - if args.world_size is None and 'WORLD_SIZE' in os.environ: - args.world_size = int(os.environ['WORLD_SIZE']) + if args.world_size is None: + if 'WORLD_SIZE' in os.environ and os.environ.get('LOCAL_WORLD_SIZE') != os.environ['WORLD_SIZE']: + # Use WORLD_SIZE env var if set and running multinode. Otherwise, default to nproc + # to enable easy overriding of number of processes when on a single node. 
+ args.world_size = int(os.environ['WORLD_SIZE']) + else: + args.world_size = args.nproc if args.base_rank is None and 'BASE_RANK' in os.environ: args.base_rank = int(os.environ['BASE_RANK']) @@ -212,9 +217,6 @@ def _parse_args(): if args.master_port is None and 'MASTER_PORT' in os.environ: args.master_port = int(os.environ['MASTER_PORT']) - if args.world_size is None: - args.world_size = args.nproc - if args.world_size < args.nproc: raise ValueError(f'world_size({args.world_size}) cannot be less than nproc({args.nproc})') From 04ba0b67843247ae001ec5a2a2b495251bece057 Mon Sep 17 00:00:00 2001 From: Saaketh Narayan Date: Mon, 17 Jun 2024 12:09:55 -0700 Subject: [PATCH 18/69] Optionally use `flash-attn`'s CE loss for metrics (#3394) * yo * slam * cuda * cuda checks * test * fix_test * gloo * gloo * lint * lint --------- Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> Co-authored-by: Mihir Patel --- .github/workflows/pr-cpu.yaml | 2 +- composer/devices/device_gpu.py | 3 + composer/metrics/nlp.py | 22 ++++++- tests/checkpoint/test_state_dict.py | 6 +- tests/metrics/test_nlp_metrics.py | 89 +++++++++++++++++++++++++++++ 5 files changed, 118 insertions(+), 4 deletions(-) diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index 1bdb383823..12f471749e 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -22,7 +22,7 @@ jobs: markers: not daily and not remote and not gpu and not doctest pytest_command: coverage run -m pytest - name: cpu-3.11-2.3 - container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04 markers: not daily and not remote and not gpu and not doctest pytest_command: coverage run -m pytest - name: cpu-doctest diff --git a/composer/devices/device_gpu.py b/composer/devices/device_gpu.py index 19cb0a774a..401368576e 100644 --- a/composer/devices/device_gpu.py +++ b/composer/devices/device_gpu.py @@ -12,6 +12,7 @@ import torch.backends.cudnn import torch.cuda import torch.cuda.amp +import torch.distributed as torch_dist import torch.utils.data from composer.devices.device import Device @@ -42,6 +43,8 @@ def __init__( ): if not torch.cuda.is_available(): raise ValueError('DeviceGPU cannot be created as torch.cuda is not available.') + if torch_dist.is_gloo_available(): + DeviceGPU.dist_backend = 'cuda:nccl,cpu:gloo' if device_id is None: device_id = dist.get_local_rank() self._device = torch.device(f'cuda:{device_id}') diff --git a/composer/metrics/nlp.py b/composer/metrics/nlp.py index e6877292cf..c1562e5936 100644 --- a/composer/metrics/nlp.py +++ b/composer/metrics/nlp.py @@ -83,7 +83,21 @@ def __init__(self, dist_sync_on_step: bool = False, ignore_index: int = -100): super().__init__(dist_sync_on_step=dist_sync_on_step) self.ignore_index = ignore_index - self.loss_fn = torch.nn.CrossEntropyLoss(ignore_index=ignore_index, reduction='sum') + self.flash_loss_fn = None + try: + from flash_attn.losses.cross_entropy import CrossEntropyLoss as FusedCrossEntropyLoss + log.debug( + 'Found `flash_attn` installation. Using CrossEntropyLoss from `flash_attn`' + + 'to compute LanguageCrossEntropy metric for CUDA tensors, which will be faster.', + ) + self.flash_loss_fn = FusedCrossEntropyLoss(ignore_index=ignore_index, reduction='sum') + except ImportError: + if torch.cuda.is_available(): + log.debug( + 'Package `flash_attn` not installed. 
Using torch.nn.CrossEntropyLoss ' + + 'to compute LanguageCrossEntropy metric for CUDA tensors, which will be slower.', + ) + self.torch_loss_fn = torch.nn.CrossEntropyLoss(ignore_index=ignore_index, reduction='sum') self.add_state('sum_loss', default=torch.tensor(0.), dist_reduce_fx='sum') self.add_state('total_items', default=torch.tensor(0), dist_reduce_fx='sum') @@ -104,7 +118,11 @@ def update(self, output: Union[Mapping, Tensor], target: Tensor) -> None: target = target.view(-1) logits = logits.view(target.shape[0], -1) - losses = self.loss_fn(logits, target) + # Use Flash attn's CE loss function, if available, if inputs are both CUDA tensors. + if self.flash_loss_fn is not None and target.is_cuda and logits.is_cuda: + losses = self.flash_loss_fn(logits, target) + else: + losses = self.torch_loss_fn(logits, target) total_items = (target != self.ignore_index).sum() self.total_items += total_items #type: ignore (third-party) diff --git a/tests/checkpoint/test_state_dict.py b/tests/checkpoint/test_state_dict.py index af0ca34961..bd14154dc9 100644 --- a/tests/checkpoint/test_state_dict.py +++ b/tests/checkpoint/test_state_dict.py @@ -7,6 +7,7 @@ import pytest import torch +import torch.distributed as torch_dist from packaging import version from torch.distributed.fsdp import FullyShardedDataParallel as FSDP from torch.optim import adam @@ -530,7 +531,10 @@ def test_get_metadata_sharded_model(model_type: str, tensor_type: str, world_siz assert 'model_name' in metadata_sd assert 'dist_backend' in metadata_sd - assert metadata_sd['dist_backend'] == 'nccl' + if torch_dist.is_gloo_available(): + assert metadata_sd['dist_backend'] == 'cuda:nccl,cpu:gloo' + else: + assert metadata_sd['dist_backend'] == 'nccl' @pytest.mark.filterwarnings('ignore:SWA has') diff --git a/tests/metrics/test_nlp_metrics.py b/tests/metrics/test_nlp_metrics.py index 7fe854bd96..9b198003d3 100644 --- a/tests/metrics/test_nlp_metrics.py +++ b/tests/metrics/test_nlp_metrics.py @@ -14,6 +14,7 @@ LanguagePerplexity, MaskedAccuracy, ) +from tests.common import device @pytest.mark.parametrize('ignore_index', [-100]) @@ -50,12 +51,100 @@ def test_masked_accuracy(ignore_index, num_classes): assert abs(final_acc - (1.0 / num_classes)) < 0.02 +@device('cpu', 'gpu') @pytest.mark.parametrize('ignore_index', [-100]) @pytest.mark.parametrize('batch_size', [1e2, 1e3]) @pytest.mark.parametrize('sequence_length', [128]) @pytest.mark.parametrize('num_classes', [2, 10]) @pytest.mark.parametrize('minibatch_size', [56, 256, 768]) +@pytest.mark.parametrize('tensor_device', ['cpu', 'gpu']) def test_cross_entropy( + device: str, + batch_size: float, + ignore_index: Optional[int], + sequence_length: int, + num_classes: int, + minibatch_size: int, + tensor_device: str, +): + """Sanity check to make sure that batched CrossEntropyLoss matches the expected performance. + + Generates a predicted distribution from a normal distribution, and a ground truth from a normal distribution. + Verifies Cross Entropy Loss against the baseline performance. + + Args: + device (str): the device to run the test on + batch_size (int): how many samples are in each batch + ignore_index (Optional[int]): if present, the class index to ignore in accuracy calculations. 
+ sequence_length (int): the length of the generated sequence + num_classes (int): the number of classes in the classification task + minibatch_size (int): the minibatch size to simulate for model predictions + tensor_device (str): which device the input tensors to the metric are on + """ + + if device == 'cpu' and tensor_device == 'gpu': + pytest.skip('Skipping test that would try to use GPU tensors when only CPU is available.') + + batch_size = int(batch_size) + generated_preds = torch.randn((batch_size, sequence_length, num_classes)) + generated_true = torch.randint(low=0, high=num_classes, size=(batch_size, sequence_length)) + + assert ignore_index is not None + torchmetrics_xent = LanguageCrossEntropy(dist_sync_on_step=False, ignore_index=ignore_index) + ce_with_keys_metric = LanguageCrossEntropy(dist_sync_on_step=False, ignore_index=ignore_index) + + if tensor_device == 'cpu': + torchmetrics_xent = torchmetrics_xent.to('cpu') + ce_with_keys_metric = ce_with_keys_metric.to('cpu') + elif tensor_device == 'gpu': + torchmetrics_xent = torchmetrics_xent.to('cuda') + ce_with_keys_metric = ce_with_keys_metric.to('cuda') + + if device == 'gpu': + assert torchmetrics_xent.flash_loss_fn is not None + + labels_mask = torch.rand((batch_size, sequence_length)) + labels_mask[labels_mask > 0.8] = 1 + labels_mask[labels_mask <= 0.8] = 0 + labels_mask = labels_mask.bool() + generated_true[labels_mask] = ignore_index + + num_batches = math.ceil(batch_size / minibatch_size) + for batch_idx in range(num_batches): + begin_idx = (batch_idx * minibatch_size) + end_idx = ((batch_idx + 1) * minibatch_size) + preds_subset = generated_preds[begin_idx:end_idx] + true_subset = generated_true[begin_idx:end_idx] + + if tensor_device == 'cpu': + preds_subset = preds_subset.cpu() + true_subset = true_subset.cpu() + elif tensor_device == 'gpu': + preds_subset = preds_subset.cuda() + true_subset = true_subset.cuda() + + torchmetrics_xent.update(preds_subset, true_subset) + ce_with_keys_metric.update( + { + 'logits': preds_subset.view(-1, num_classes), + 'loss': cross_entropy(preds_subset.view(-1, num_classes), true_subset.view(-1)), + }, + true_subset.view(-1), + ) + + torchmetrics_loss = torchmetrics_xent.compute() + ce_with_keys_loss = ce_with_keys_metric.compute() + correct_loss = cross_entropy(generated_preds.view(-1, num_classes), generated_true.view(-1)) + assert torchmetrics_loss == ce_with_keys_loss + assert torch.isclose(correct_loss, torchmetrics_loss) + + +@pytest.mark.parametrize('ignore_index', [-100]) +@pytest.mark.parametrize('batch_size', [1e2, 1e3]) +@pytest.mark.parametrize('sequence_length', [128]) +@pytest.mark.parametrize('num_classes', [2, 10]) +@pytest.mark.parametrize('minibatch_size', [56, 256, 768]) +def test_torch_cpu_cross_entropy( batch_size: float, ignore_index: Optional[int], sequence_length: int, From 1dfd3bc999ad839f1bd83ecfecf01832e8965ccb Mon Sep 17 00:00:00 2001 From: Jesse Chan Date: Mon, 17 Jun 2024 12:30:39 -0700 Subject: [PATCH 19/69] log image fix (#3286) * log image fix Signed-off-by: Jesse Chan * fixed log image tests Signed-off-by: Jesse Chan * linter Signed-off-by: Jesse Chan * add simd requirement * post0? * versioning yada yada yada * guh * import fix? * update deps * fix * fix II * remove other dependency * debug statement, remove * post1?! * build from source * whitespace? 
* use pillow * delete a unit test and ignore some types * s/type/pyright * formatting * formatting * ignore more stuff * Apply suggestions from code review * remove rest * Update setup.py Co-authored-by: Mihir Patel * try no ignore * remove intenum --------- Signed-off-by: Jesse Chan Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> Co-authored-by: Milo Cress Co-authored-by: Mihir Patel --- .../utils/augmentation_primitives.py | 31 ++++++++++++++++--- composer/loggers/mlflow_logger.py | 3 +- setup.py | 2 +- tests/loggers/test_mlflow_logger.py | 5 +-- tests/test_docker.py | 18 ----------- 5 files changed, 32 insertions(+), 27 deletions(-) delete mode 100644 tests/test_docker.py diff --git a/composer/algorithms/utils/augmentation_primitives.py b/composer/algorithms/utils/augmentation_primitives.py index d2b2417e5c..d3ac807c02 100644 --- a/composer/algorithms/utils/augmentation_primitives.py +++ b/composer/algorithms/utils/augmentation_primitives.py @@ -30,6 +30,7 @@ import numpy as np from PIL import Image, ImageEnhance, ImageOps +from PIL.Image import Resampling, Transform AugmentationFn = Callable[[Image.Image, float], Image.Image] @@ -155,7 +156,7 @@ def rotate(pil_img: Image.Image, level: float): degrees = _int_parameter(_sample_level(level), 30) if np.random.uniform() > 0.5: degrees = -degrees - return pil_img.rotate(degrees, resample=Image.BILINEAR) + return pil_img.rotate(degrees, resample=Resampling.BILINEAR) def solarize(pil_img: Image.Image, level: float): @@ -183,7 +184,12 @@ def shear_x(pil_img: Image.Image, level: float): level = _float_parameter(_sample_level(level), 0.3) if np.random.uniform() > 0.5: level = -level - return pil_img.transform(pil_img.size, Image.AFFINE, (1, level, 0, 0, 1, 0), resample=Image.BILINEAR) + return pil_img.transform( + pil_img.size, + Transform.AFFINE, + (1, level, 0, 0, 1, 0), + resample=Resampling.BILINEAR, + ) def shear_y(pil_img: Image.Image, level: float): @@ -197,7 +203,12 @@ def shear_y(pil_img: Image.Image, level: float): level = _float_parameter(_sample_level(level), 0.3) if np.random.uniform() > 0.5: level = -level - return pil_img.transform(pil_img.size, Image.AFFINE, (1, 0, 0, level, 1, 0), resample=Image.BILINEAR) + return pil_img.transform( + pil_img.size, + Transform.AFFINE, + (1, 0, 0, level, 1, 0), + resample=Resampling.BILINEAR, + ) def translate_x(pil_img: Image.Image, level: float): @@ -211,7 +222,12 @@ def translate_x(pil_img: Image.Image, level: float): level = _int_parameter(_sample_level(level), pil_img.size[0] / 3) if np.random.random() > 0.5: level = -level - return pil_img.transform(pil_img.size, Image.AFFINE, (1, 0, level, 0, 1, 0), resample=Image.BILINEAR) + return pil_img.transform( + pil_img.size, + Transform.AFFINE, + (1, 0, level, 0, 1, 0), + resample=Resampling.BILINEAR, + ) def translate_y(pil_img: Image.Image, level: float): @@ -225,7 +241,12 @@ def translate_y(pil_img: Image.Image, level: float): level = _int_parameter(_sample_level(level), pil_img.size[1] / 3) if np.random.random() > 0.5: level = -level - return pil_img.transform(pil_img.size, Image.AFFINE, (1, 0, 0, 0, 1, level), resample=Image.BILINEAR) + return pil_img.transform( + pil_img.size, + Transform.AFFINE, + (1, 0, 0, 0, 1, level), + resample=Resampling.BILINEAR, + ) # The following augmentations overlap with corruptions in the ImageNet-C/CIFAR10-C test diff --git a/composer/loggers/mlflow_logger.py b/composer/loggers/mlflow_logger.py index 9a64ef5d9d..03070c28f9 100644 --- a/composer/loggers/mlflow_logger.py +++ 
b/composer/loggers/mlflow_logger.py @@ -507,8 +507,9 @@ def log_images( assert isinstance(self._run_id, str) self._mlflow_client.log_image( image=image, - artifact_file=f'{name}_{step}_{im_ind}.png', + key=f'{name}_{step}_{im_ind}', run_id=self._run_id, + step=step, ) def post_close(self): diff --git a/setup.py b/setup.py index 5beb4d5136..cbffa0b79c 100644 --- a/setup.py +++ b/setup.py @@ -142,7 +142,7 @@ def package_files(prefix: str, directory: str, extension: str): 'cryptography==42.0.8', 'pytest-httpserver>=1.0.4,<1.1', 'setuptools<=59.5.0', - 'pillow==9.3.0', # Matches the Pillow version listed in the Dockerfile + 'pillow>=10.3.0,<11', ] extra_deps['system_metrics_monitor'] = { diff --git a/tests/loggers/test_mlflow_logger.py b/tests/loggers/test_mlflow_logger.py index 4fe221f52d..5ee6aab7a5 100644 --- a/tests/loggers/test_mlflow_logger.py +++ b/tests/loggers/test_mlflow_logger.py @@ -650,8 +650,9 @@ def before_forward(self, state: State, logger: Logger): experiment_id = run.info.experiment_id run_file_path = mlflow_uri / Path(experiment_id) / Path(run_id) - im_dir = run_file_path / Path('artifacts') - assert len(os.listdir(im_dir)) == expected_num_ims + im_dir = run_file_path / Path('artifacts') / Path('images') + # 2 (compressed & uncompressed) per image, and two log images calls in ImageLogger + assert len(os.listdir(im_dir)) == expected_num_ims * 2 * 2 @device('cpu') diff --git a/tests/test_docker.py b/tests/test_docker.py deleted file mode 100644 index 8a269d7563..0000000000 --- a/tests/test_docker.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -import os -import platform - -import PIL -import pytest - - -@pytest.mark.skipif( - 'composer-python' not in os.environ['PATH'] or 'Linux' not in platform.system(), - reason='Pillow-simd test only checks if using the composer docker', -) -class TestDocker: - - def test_pillow_simd(self): - assert 'post' in PIL.__version__, 'pillow-simd is not installed' From f7e17de45a439e4cc84b87fab124a5c5d2ac93e4 Mon Sep 17 00:00:00 2001 From: Evan Racah Date: Mon, 17 Jun 2024 15:58:32 -0700 Subject: [PATCH 20/69] [ckpt-rewr] Save state dict API (#3372) --- composer/checkpoint/save.py | 145 ++++++++++++++++++++++++++++ docs/source/conf.py | 1 - tests/checkpoint/helpers.py | 110 +++++++++++++++++++++ tests/checkpoint/test_save.py | 79 +++++++++++++++ tests/checkpoint/test_state_dict.py | 107 ++------------------ tests/common/compare.py | 36 ++++++- 6 files changed, 377 insertions(+), 101 deletions(-) create mode 100644 composer/checkpoint/save.py create mode 100644 tests/checkpoint/helpers.py create mode 100644 tests/checkpoint/test_save.py diff --git a/composer/checkpoint/save.py b/composer/checkpoint/save.py new file mode 100644 index 0000000000..72e5311d0f --- /dev/null +++ b/composer/checkpoint/save.py @@ -0,0 +1,145 @@ +# Copyright 2024 MosaicML Composer authors +# SPDX-License-Identifier: Apache-2.0 + +"""Useful functions for saving state dicts to disk.""" + +import logging +import os +import textwrap +import warnings +from pathlib import Path +from typing import Any, Dict, Optional, Union + +import torch +import torch.distributed.checkpoint as DCP +from packaging import version +from torch.distributed._shard.sharded_tensor import ShardedTensor +from torch.distributed._tensor import DTensor + +from composer.utils import dist +from composer.utils.checkpoint import _TORCH_DISTRIBUTED_CHECKPOINTS_FILENAME, _write_checkpoint_file + +log = logging.getLogger(__name__) + + +def 
save_state_dict_to_disk(
+    state_dict: Dict[str, Any],
+    destination_file_path: str,
+    overwrite: bool = False,
+    save_format: str = 'pt', # or hf, safetensor
+) -> Optional[str]:
+    """Saves a state dict to local disk.
+
+    Args:
+        state_dict (Dict[str,Any]): The state dict to save.
+        destination_file_path (str): The path to save the state dict to. If sharded,
+            this should be the path to a directory. Otherwise, it should be a path to a file.
+        overwrite (bool): If True, the file will be overwritten if it exists.
+        save_format (str): The format to save the state dict in. One of 'pt', 'hf', or 'safetensor'.
+
+    Returns:
+        str: The full path to the saved state dict if sharded is true, or if sharded is false and this is rank 0; otherwise None.
+    """
+    if state_dict == {}:
+        return None
+    if is_state_dict_sharded(state_dict):
+        path_saved = _save_sharded_state_dict_to_disk(state_dict, destination_file_path, overwrite, save_format)
+    else:
+        if dist.get_global_rank() == 0:
+            path_saved = _save_full_state_dict_to_disk(state_dict, destination_file_path, overwrite, save_format)
+        else:
+            path_saved = None
+
+    return path_saved
+
+
+def _save_sharded_state_dict_to_disk(
+    state_dict: Dict[str, Any],
+    destination_file_path: str,
+    overwrite: bool = False,
+    save_format: str = 'pt',
+) -> Optional[str]:
+
+    if save_format != 'pt':
+        raise NotImplementedError(
+            f"Saving sharded state dict to disk in format {save_format} is not supported. Please choose from ['pt'].",
+        )
+
+    if state_dict == {}:
+        return None
+
+    # If the user specifies a filename instead of a directory, strip the suffixes and warn
+    if len(Path(destination_file_path).suffixes) > 0:
+        stripped_path = _strip_suffixes(destination_file_path)
+        warnings.warn(
+            textwrap.dedent(
+                f"""Sharded checkpoints require a directory path not a file path:
+                {destination_file_path} will have its extensions stripped and checkpoints will be saved in {stripped_path}
+                as {stripped_path}/{_TORCH_DISTRIBUTED_CHECKPOINTS_FILENAME}""",
+            ),
+        )
+        destination_file_path = stripped_path
+
+    if dist.get_global_rank() == 0 and not overwrite and os.path.exists(destination_file_path):
+        raise ValueError(f'Directory {destination_file_path} already exists. Set overwrite=True to overwrite it.')
+
+    log.debug(
+        f'Starting saving of sharded state dict to {destination_file_path}/{_TORCH_DISTRIBUTED_CHECKPOINTS_FILENAME}',
+    )
+
+    # For 2.3.0 and above you can use checkpoint_id, but this version works best for all versions
+    # of torch that we support (and makes pyright happier), so we use it for now.
+    if version.parse(torch.__version__) < version.parse('2.2.0'):
+        DCP.save_state_dict(state_dict=state_dict, storage_writer=DCP.FileSystemWriter(destination_file_path))
+    else:
+        DCP.save(state_dict=state_dict, storage_writer=DCP.FileSystemWriter(destination_file_path))
+
+    return destination_file_path + '/' + _TORCH_DISTRIBUTED_CHECKPOINTS_FILENAME
+
+
+def _save_full_state_dict_to_disk(
+    state_dict: Dict[str, Any],
+    destination_file_path: str,
+    overwrite: bool = False,
+    save_format: str = 'pt', # or hf, safetensor
+) -> Optional[str]:
+
+    if save_format != 'pt':
+        raise NotImplementedError(
+            f"Saving a full state dict to disk in format {save_format} is not supported. Please choose from ['pt'].",
+        )
+
+    if not overwrite and os.path.exists(destination_file_path):
+        raise ValueError(f'File {destination_file_path} already exists. 
Set overwrite=True to overwrite it.') + + if dist.get_global_rank() == 0: + _write_checkpoint_file(state_dict=state_dict, filename=destination_file_path) + return destination_file_path + return None + + +def is_state_dict_sharded(state_dict: Dict[str, Any]) -> bool: + """Determines if the state dict is sharded. + + Args: + state_dict (Dict[str, Any]): The state dict to check. + + Returns: + bool: Whether the state dict is sharded. + """ + for value in state_dict.values(): + if isinstance(value, ShardedTensor) or isinstance(value, DTensor): + return True + if isinstance(value, Dict): + is_sharded = is_state_dict_sharded(value) + if is_sharded: + return True + return False + + +def _strip_suffixes(path: Union[str, Path]) -> str: + path = Path(path) + for _ in path.suffixes: + path = path.with_suffix('') + + return str(path) diff --git a/docs/source/conf.py b/docs/source/conf.py index 45affa4a0e..533ce95b78 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -219,7 +219,6 @@ def _get_commit_sha() -> str: 'torch': ('https://pytorch.org/docs/stable/', None), 'torchvision': ('https://pytorch.org/vision/stable/', None), 'torchtext': ('https://pytorch.org/text/stable/', None), - 'torchmetrics': ('https://torchmetrics.readthedocs.io/en/latest/', None), 'libcloud': ('https://libcloud.readthedocs.io/en/stable/', None), 'PIL': ('https://pillow.readthedocs.io/en/stable', None), 'coolname': ('https://coolname.readthedocs.io/en/latest/', None), diff --git a/tests/checkpoint/helpers.py b/tests/checkpoint/helpers.py new file mode 100644 index 0000000000..047d30e813 --- /dev/null +++ b/tests/checkpoint/helpers.py @@ -0,0 +1,110 @@ +# Copyright 2024 MosaicML Composer authors +# SPDX-License-Identifier: Apache-2.0 + +from typing import Any, Dict + +import torch +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.distributed.fsdp.api import CPUOffload +from torch.optim import adam + +from tests.common.models import EvenSimplerMLP, SimpleComposerMLP + +__all__ = [ + 'init_model_and_optimizer', + 'init_model', + 'init_optimizer', +] + + +def init_model_and_optimizer( + use_composer_model: bool, + num_classes=3, + batch_size=5, + num_features=8, + take_step=True, + use_fsdp=False, + tensor_type='sharded_tensor', + device='cuda', +): + model, loss_fn = init_model( + use_composer_model, + num_classes=num_classes, + num_features=num_features, + use_fsdp=use_fsdp, + tensor_type=tensor_type, + device=device, + ) + + optimizer = init_optimizer( + model, + loss_fn, + use_composer_model=use_composer_model, + num_classes=num_classes, + batch_size=batch_size, + num_features=num_features, + take_step=take_step, + device=device, + ) + + return model, optimizer + + +def init_model( + use_composer_model: bool = False, + num_classes=3, + num_features=8, + use_fsdp=False, + device='cuda', + tensor_type='sharded_tensor', + sync_module_states=True, + cpu_offload=False, +): + if use_composer_model: + model = SimpleComposerMLP(num_features=num_features, num_classes=num_classes, device=device) + loss_fn = model._loss_fn + else: + model = EvenSimplerMLP(num_features=num_features, num_out_features=num_classes, device=device) + loss_fn = torch.nn.CrossEntropyLoss() + + if use_fsdp: + fsdp_kwargs: Dict[str, Any] = dict( + use_orig_params=True, + sync_module_states=sync_module_states, # To enable easy comparison between rank 0 unsharded model and full state dict + cpu_offload=CPUOffload(offload_params=True) if cpu_offload else None, + device_id=torch.device('cpu') if device == 'cpu' else None, + ) + 
+ if tensor_type == 'dtensor': + from torch.distributed.device_mesh import init_device_mesh + device_mesh = init_device_mesh('cuda', (2,)) + fsdp_kwargs['device_mesh'] = device_mesh + + model = FSDP( + model, + **fsdp_kwargs, + ) + + return model, loss_fn + + +def init_optimizer( + model, + loss_fn, + use_composer_model: bool = False, + num_classes=3, + batch_size=5, + num_features=8, + take_step=True, + device='cuda', +): + inputs = torch.randn(batch_size, num_features, device=device) + targets = torch.randint(low=0, high=num_classes, size=(batch_size,), device=device, dtype=torch.long) + batch = (inputs, targets) if use_composer_model else inputs + optimizer = adam.Adam(model.parameters()) + outputs = model(batch) + loss = loss_fn(outputs, targets) + loss.backward() + if take_step: + optimizer.step() + return optimizer diff --git a/tests/checkpoint/test_save.py b/tests/checkpoint/test_save.py new file mode 100644 index 0000000000..03b12bbcbc --- /dev/null +++ b/tests/checkpoint/test_save.py @@ -0,0 +1,79 @@ +# Copyright 2024 MosaicML Composer authors +# SPDX-License-Identifier: Apache-2.0 + +import os +import time +import uuid +from copy import deepcopy +from pathlib import Path + +import pytest +import torch +import torch.distributed.checkpoint as DCP +from packaging import version + +from composer.checkpoint.save import save_state_dict_to_disk +from composer.checkpoint.state_dict import get_model_state_dict +from composer.utils import dist +from composer.utils.checkpoint import _TORCH_DISTRIBUTED_CHECKPOINTS_FILENAME +from tests.checkpoint.helpers import init_model +from tests.common.compare import deep_compare +from tests.common.markers import world_size + + +@world_size(1, 2) +@pytest.mark.gpu +@pytest.mark.parametrize('sharded_model', [False, True]) +def test_save_full_state_dict_to_disk(world_size: int, tmp_path: str, sharded_model: bool): + if world_size == 1 and sharded_model: + pytest.skip("Can't have a sharded model for world_size = 1") + destination_file_path = os.path.join(tmp_path, 'test.pt') + use_fsdp = sharded_model + model, _ = init_model(use_fsdp=use_fsdp, device='cuda', sync_module_states=True) + + state_dict = get_model_state_dict(model, sharded_state_dict=False) + path_saved = save_state_dict_to_disk(state_dict, destination_file_path=destination_file_path) + time.sleep(1) + if dist.get_global_rank() == 0: + assert path_saved is not None + assert path_saved == destination_file_path + assert os.path.exists(destination_file_path), f'{destination_file_path} does not exist' + loaded_state_dict = torch.load(path_saved, map_location='cuda') + deep_compare(state_dict, loaded_state_dict) + else: + assert path_saved is None + + +@world_size(2) +@pytest.mark.gpu +@pytest.mark.parametrize( + 'tensor_type', + [ + 'sharded_tensor', + pytest.param( + 'dtensor', + marks=pytest.mark.skipif( + version.parse(torch.__version__) < version.parse('2.2.0'), + reason='Requires torch>=2.2.0 for dtensor', + ), + ), + ], +) +def test_save_sharded_state_dict_to_disk(world_size: int, tmp_path: str, tensor_type: str): + + destination_file_path = os.path.join(tmp_path, str(uuid.uuid4())[:8]) + # Sync the path across all ranks + destination_file_path = dist.all_gather_object(destination_file_path)[0] + model, _ = init_model(use_fsdp=True, device='cuda', tensor_type=tensor_type) + + state_dict = get_model_state_dict(model, sharded_state_dict=True) + loaded_in_state_dict = deepcopy(state_dict) + path_saved = save_state_dict_to_disk(state_dict, destination_file_path=destination_file_path, 
overwrite=True) + assert path_saved == f'{destination_file_path}/{_TORCH_DISTRIBUTED_CHECKPOINTS_FILENAME}' + assert path_saved is not None + load_path = str(Path(path_saved).parent) + if version.parse(torch.__version__) < version.parse('2.2.0'): + DCP.load_state_dict(state_dict=loaded_in_state_dict, storage_reader=DCP.FileSystemReader(load_path)) + else: + DCP.load(state_dict=loaded_in_state_dict, storage_reader=DCP.FileSystemReader(load_path)) + deep_compare(state_dict, loaded_in_state_dict) diff --git a/tests/checkpoint/test_state_dict.py b/tests/checkpoint/test_state_dict.py index bd14154dc9..e010440836 100644 --- a/tests/checkpoint/test_state_dict.py +++ b/tests/checkpoint/test_state_dict.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import datetime -from typing import Any, Dict +from typing import Any from unittest.mock import MagicMock import pytest @@ -10,7 +10,6 @@ import torch.distributed as torch_dist from packaging import version from torch.distributed.fsdp import FullyShardedDataParallel as FSDP -from torch.optim import adam from torch.optim.lr_scheduler import StepLR from torch.utils.data import DataLoader @@ -25,6 +24,7 @@ from composer.core import State from composer.devices import DeviceCPU, DeviceGPU from composer.utils import dist, reproducibility +from tests.checkpoint.helpers import init_model_and_optimizer from tests.common.compare import deep_compare from tests.common.markers import world_size from tests.common.models import EvenSimplerMLP, SimpleComposerMLP, configure_tiny_gpt2_hf_model @@ -247,101 +247,10 @@ def test_get_model_state_dict_precision_unsharded_model(precision: str, use_comp assert tens.dtype == precision -def _init_model_and_optimizer( - use_composer_model: bool, - num_classes=3, - batch_size=5, - num_features=8, - take_step=True, - use_fsdp=False, - tensor_type='sharded_tensor', - device='cuda', -): - model, loss_fn = _init_model( - use_composer_model, - num_classes=num_classes, - batch_size=batch_size, - num_features=num_features, - use_fsdp=use_fsdp, - tensor_type=tensor_type, - device=device, - ) - - optimizer = _init_optimizer( - model, - loss_fn, - use_composer_model=use_composer_model, - num_classes=num_classes, - batch_size=batch_size, - num_features=num_features, - take_step=take_step, - device=device, - ) - - return model, optimizer - - -def _init_model( - use_composer_model: bool = False, - num_classes=3, - batch_size=5, - num_features=8, - use_fsdp=False, - device='cuda', - tensor_type='sharded_tensor', -): - if use_composer_model: - model = SimpleComposerMLP(num_features=num_features, num_classes=num_classes, device=device) - loss_fn = model._loss_fn - else: - model = EvenSimplerMLP(num_features=num_features, num_out_features=num_classes, device=device) - loss_fn = torch.nn.CrossEntropyLoss() - - if use_fsdp: - fsdp_kwargs: Dict[str, Any] = dict( - use_orig_params=True, - sync_module_states=True, # To enable easy comparison between rank 0 unsharded model and full state dict - ) - - if tensor_type == 'dtensor': - from torch.distributed.device_mesh import init_device_mesh - device_mesh = init_device_mesh('cuda', (2,)) - fsdp_kwargs['device_mesh'] = device_mesh - - model = FSDP( - model, - **fsdp_kwargs, - ) - - return model, loss_fn - - -def _init_optimizer( - model, - loss_fn, - use_composer_model: bool = False, - num_classes=3, - batch_size=5, - num_features=8, - take_step=True, - device='cuda', -): - inputs = torch.randn(batch_size, num_features, device=device) - targets = torch.randint(low=0, high=num_classes, 
size=(batch_size,), device=device, dtype=torch.long) - batch = (inputs, targets) if use_composer_model else inputs - optimizer = adam.Adam(model.parameters()) - outputs = model(batch) - loss = loss_fn(outputs, targets) - loss.backward() - if take_step: - optimizer.step() - return optimizer - - @pytest.mark.gpu @pytest.mark.parametrize('use_composer_model', [True, False]) def test_get_optim_state_dict_unsharded_model(use_composer_model: bool): - model, optimizer = _init_model_and_optimizer(use_composer_model=use_composer_model, take_step=True) + model, optimizer = init_model_and_optimizer(use_composer_model=use_composer_model, take_step=True) optim_state_dict = get_optim_state_dict(model, optimizer) # Dict mapping parameter index to optimizer state for that parameter. @@ -385,7 +294,7 @@ def test_get_optim_state_dict_unsharded_model(use_composer_model: bool): ) @pytest.mark.parametrize('use_composer_model', [True, False]) def test_get_optim_state_dict_precision_unsharded_model(precision: str, use_composer_model: bool): - model, optimizer = _init_model_and_optimizer(use_composer_model=use_composer_model, take_step=True) + model, optimizer = init_model_and_optimizer(use_composer_model=use_composer_model, take_step=True) optim_state_dict = get_optim_state_dict(model, optimizer, precision=precision) for param_state in optim_state_dict['state'].values(): assert param_state['exp_avg'].dtype == precision @@ -400,7 +309,7 @@ def test_get_optim_dict_full_for_sharded_model(world_size, tensor_type, use_comp if tensor_type == 'dtensor' and version.parse(torch.__version__) < version.parse('2.2.0'): pytest.skip('DTensor is only supported in PyTorch >= 2.2.0') - model, optimizer = _init_model_and_optimizer( + model, optimizer = init_model_and_optimizer( use_composer_model=use_composer_model, take_step=True, use_fsdp=True, @@ -427,7 +336,7 @@ def test_get_optim_dict_sharded_for_sharded_model(world_size, tensor_type, use_c if tensor_type == 'dtensor' and version.parse(torch.__version__) < version.parse('2.2.0'): pytest.skip('DTensor is only supported in PyTorch >= 2.2.0') - model, optimizer = _init_model_and_optimizer( + model, optimizer = init_model_and_optimizer( use_composer_model=use_composer_model, take_step=True, use_fsdp=True, @@ -540,7 +449,7 @@ def test_get_metadata_sharded_model(model_type: str, tensor_type: str, world_siz @pytest.mark.filterwarnings('ignore:SWA has') def test_get_resumption_state_dict(): - model, optimizer = _init_model_and_optimizer(use_composer_model=True, take_step=True, device='cpu') + model, optimizer = init_model_and_optimizer(use_composer_model=True, take_step=True, device='cpu') rank_zero_seed = 10 run_name = 'test_run' @@ -605,7 +514,7 @@ def test_get_resumption_state_dict_gpu(): else: from torch.cuda.amp.grad_scaler import GradScaler - model, _ = _init_model_and_optimizer(use_composer_model=True, take_step=False, device='cuda') + model, _ = init_model_and_optimizer(use_composer_model=True, take_step=False, device='cuda') rank_zero_seed = 10 run_name = 'test_run' diff --git a/tests/common/compare.py b/tests/common/compare.py index 432ac55dfd..79dfe573bb 100644 --- a/tests/common/compare.py +++ b/tests/common/compare.py @@ -7,6 +7,8 @@ import numpy as np import torch import torchmetrics +from torch.distributed._shard.sharded_tensor import ShardedTensor +from torch.distributed._tensor import DTensor from composer import Time from composer.core.time import TimeUnit @@ -39,7 +41,7 @@ def _check_item( assert type(item1) == type(item2) assert item1 == item2, f'{path} differs: 
{item1} != {item2}' return - if isinstance(item1, torch.Tensor): + if isinstance(item1, torch.Tensor) and not (isinstance(item1, ShardedTensor) or isinstance(item1, DTensor)): assert isinstance(item2, torch.Tensor) if item1.device != item2.device: item1 = item1.cpu() @@ -58,6 +60,16 @@ def _check_item( assert isinstance(item2, type(item1)), f'{path} differs: {item1} != {item2}' _check_list_recursively(item1, item2, path, atol=atol, rtol=rtol) return + if isinstance(item1, ShardedTensor): + assert isinstance(item2, type(item1)), f'{path} differs: {item1} != {item2}' + _check_sharded_tensor_recursively(item1, item2, path, atol=atol, rtol=rtol) + return + + if isinstance(item1, DTensor): + assert isinstance(item2, type(item1)), f'{path} differs: {item1} != {item2}' + _check_dtensor_recursively(item1, item2, path, atol=atol, rtol=rtol) + return + if isinstance(item1, torchmetrics.Metric): assert isinstance(item2, torchmetrics.Metric), f'{path} differs: {item1} != {item2}' # Increase update count so Torchmetrics doesn't throw warning when computing two metrics which haven't been updated @@ -84,6 +96,28 @@ def _check_item( raise NotImplementedError(f'Unsupported item type: {type(item1)}') +def _check_dtensor_recursively( + dtensor1: DTensor, + dtensor2: DTensor, + path: str, + atol: float, + rtol: float, +): + tensor1, tensor2 = dtensor1.to_local(), dtensor2.to_local() + _check_item(tensor1, tensor2, path, atol=atol, rtol=rtol) + + +def _check_sharded_tensor_recursively( + sharded_tensor1: ShardedTensor, + sharded_tensor2: ShardedTensor, + path: str, + atol: float, + rtol: float, +): + tensor1, tensor2 = sharded_tensor1.local_tensor(), sharded_tensor2.local_tensor() + _check_item(tensor1, tensor2, path, atol=atol, rtol=rtol) + + def _check_list_recursively( list1: Union[tuple[Any], list[Any]], list2: Union[tuple[Any], list[Any]], From 0a1a6a457a7a11b51dcd652cc93dc61a2aef246e Mon Sep 17 00:00:00 2001 From: Saaketh Narayan Date: Mon, 17 Jun 2024 20:14:17 -0700 Subject: [PATCH 21/69] Revert "Optionally use `flash-attn`'s CE loss for metrics (#3394)" (#3408) This reverts commit 2cf9262e988c7cc4ee107259b98efec0298c5017. 
revert dat boi --- .github/workflows/pr-cpu.yaml | 2 +- composer/devices/device_gpu.py | 3 - composer/metrics/nlp.py | 22 +------ tests/checkpoint/test_state_dict.py | 6 +- tests/metrics/test_nlp_metrics.py | 89 ----------------------------- 5 files changed, 4 insertions(+), 118 deletions(-) diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index 12f471749e..1bdb383823 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -22,7 +22,7 @@ jobs: markers: not daily and not remote and not gpu and not doctest pytest_command: coverage run -m pytest - name: cpu-3.11-2.3 - container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04 markers: not daily and not remote and not gpu and not doctest pytest_command: coverage run -m pytest - name: cpu-doctest diff --git a/composer/devices/device_gpu.py b/composer/devices/device_gpu.py index 401368576e..19cb0a774a 100644 --- a/composer/devices/device_gpu.py +++ b/composer/devices/device_gpu.py @@ -12,7 +12,6 @@ import torch.backends.cudnn import torch.cuda import torch.cuda.amp -import torch.distributed as torch_dist import torch.utils.data from composer.devices.device import Device @@ -43,8 +42,6 @@ def __init__( ): if not torch.cuda.is_available(): raise ValueError('DeviceGPU cannot be created as torch.cuda is not available.') - if torch_dist.is_gloo_available(): - DeviceGPU.dist_backend = 'cuda:nccl,cpu:gloo' if device_id is None: device_id = dist.get_local_rank() self._device = torch.device(f'cuda:{device_id}') diff --git a/composer/metrics/nlp.py b/composer/metrics/nlp.py index c1562e5936..e6877292cf 100644 --- a/composer/metrics/nlp.py +++ b/composer/metrics/nlp.py @@ -83,21 +83,7 @@ def __init__(self, dist_sync_on_step: bool = False, ignore_index: int = -100): super().__init__(dist_sync_on_step=dist_sync_on_step) self.ignore_index = ignore_index - self.flash_loss_fn = None - try: - from flash_attn.losses.cross_entropy import CrossEntropyLoss as FusedCrossEntropyLoss - log.debug( - 'Found `flash_attn` installation. Using CrossEntropyLoss from `flash_attn`' + - 'to compute LanguageCrossEntropy metric for CUDA tensors, which will be faster.', - ) - self.flash_loss_fn = FusedCrossEntropyLoss(ignore_index=ignore_index, reduction='sum') - except ImportError: - if torch.cuda.is_available(): - log.debug( - 'Package `flash_attn` not installed. Using torch.nn.CrossEntropyLoss ' + - 'to compute LanguageCrossEntropy metric for CUDA tensors, which will be slower.', - ) - self.torch_loss_fn = torch.nn.CrossEntropyLoss(ignore_index=ignore_index, reduction='sum') + self.loss_fn = torch.nn.CrossEntropyLoss(ignore_index=ignore_index, reduction='sum') self.add_state('sum_loss', default=torch.tensor(0.), dist_reduce_fx='sum') self.add_state('total_items', default=torch.tensor(0), dist_reduce_fx='sum') @@ -118,11 +104,7 @@ def update(self, output: Union[Mapping, Tensor], target: Tensor) -> None: target = target.view(-1) logits = logits.view(target.shape[0], -1) - # Use Flash attn's CE loss function, if available, if inputs are both CUDA tensors. 
- if self.flash_loss_fn is not None and target.is_cuda and logits.is_cuda: - losses = self.flash_loss_fn(logits, target) - else: - losses = self.torch_loss_fn(logits, target) + losses = self.loss_fn(logits, target) total_items = (target != self.ignore_index).sum() self.total_items += total_items #type: ignore (third-party) diff --git a/tests/checkpoint/test_state_dict.py b/tests/checkpoint/test_state_dict.py index e010440836..4f719254a7 100644 --- a/tests/checkpoint/test_state_dict.py +++ b/tests/checkpoint/test_state_dict.py @@ -7,7 +7,6 @@ import pytest import torch -import torch.distributed as torch_dist from packaging import version from torch.distributed.fsdp import FullyShardedDataParallel as FSDP from torch.optim.lr_scheduler import StepLR @@ -440,10 +439,7 @@ def test_get_metadata_sharded_model(model_type: str, tensor_type: str, world_siz assert 'model_name' in metadata_sd assert 'dist_backend' in metadata_sd - if torch_dist.is_gloo_available(): - assert metadata_sd['dist_backend'] == 'cuda:nccl,cpu:gloo' - else: - assert metadata_sd['dist_backend'] == 'nccl' + assert metadata_sd['dist_backend'] == 'nccl' @pytest.mark.filterwarnings('ignore:SWA has') diff --git a/tests/metrics/test_nlp_metrics.py b/tests/metrics/test_nlp_metrics.py index 9b198003d3..7fe854bd96 100644 --- a/tests/metrics/test_nlp_metrics.py +++ b/tests/metrics/test_nlp_metrics.py @@ -14,7 +14,6 @@ LanguagePerplexity, MaskedAccuracy, ) -from tests.common import device @pytest.mark.parametrize('ignore_index', [-100]) @@ -51,100 +50,12 @@ def test_masked_accuracy(ignore_index, num_classes): assert abs(final_acc - (1.0 / num_classes)) < 0.02 -@device('cpu', 'gpu') @pytest.mark.parametrize('ignore_index', [-100]) @pytest.mark.parametrize('batch_size', [1e2, 1e3]) @pytest.mark.parametrize('sequence_length', [128]) @pytest.mark.parametrize('num_classes', [2, 10]) @pytest.mark.parametrize('minibatch_size', [56, 256, 768]) -@pytest.mark.parametrize('tensor_device', ['cpu', 'gpu']) def test_cross_entropy( - device: str, - batch_size: float, - ignore_index: Optional[int], - sequence_length: int, - num_classes: int, - minibatch_size: int, - tensor_device: str, -): - """Sanity check to make sure that batched CrossEntropyLoss matches the expected performance. - - Generates a predicted distribution from a normal distribution, and a ground truth from a normal distribution. - Verifies Cross Entropy Loss against the baseline performance. - - Args: - device (str): the device to run the test on - batch_size (int): how many samples are in each batch - ignore_index (Optional[int]): if present, the class index to ignore in accuracy calculations. 
- sequence_length (int): the length of the generated sequence - num_classes (int): the number of classes in the classification task - minibatch_size (int): the minibatch size to simulate for model predictions - tensor_device (str): which device the input tensors to the metric are on - """ - - if device == 'cpu' and tensor_device == 'gpu': - pytest.skip('Skipping test that would try to use GPU tensors when only CPU is available.') - - batch_size = int(batch_size) - generated_preds = torch.randn((batch_size, sequence_length, num_classes)) - generated_true = torch.randint(low=0, high=num_classes, size=(batch_size, sequence_length)) - - assert ignore_index is not None - torchmetrics_xent = LanguageCrossEntropy(dist_sync_on_step=False, ignore_index=ignore_index) - ce_with_keys_metric = LanguageCrossEntropy(dist_sync_on_step=False, ignore_index=ignore_index) - - if tensor_device == 'cpu': - torchmetrics_xent = torchmetrics_xent.to('cpu') - ce_with_keys_metric = ce_with_keys_metric.to('cpu') - elif tensor_device == 'gpu': - torchmetrics_xent = torchmetrics_xent.to('cuda') - ce_with_keys_metric = ce_with_keys_metric.to('cuda') - - if device == 'gpu': - assert torchmetrics_xent.flash_loss_fn is not None - - labels_mask = torch.rand((batch_size, sequence_length)) - labels_mask[labels_mask > 0.8] = 1 - labels_mask[labels_mask <= 0.8] = 0 - labels_mask = labels_mask.bool() - generated_true[labels_mask] = ignore_index - - num_batches = math.ceil(batch_size / minibatch_size) - for batch_idx in range(num_batches): - begin_idx = (batch_idx * minibatch_size) - end_idx = ((batch_idx + 1) * minibatch_size) - preds_subset = generated_preds[begin_idx:end_idx] - true_subset = generated_true[begin_idx:end_idx] - - if tensor_device == 'cpu': - preds_subset = preds_subset.cpu() - true_subset = true_subset.cpu() - elif tensor_device == 'gpu': - preds_subset = preds_subset.cuda() - true_subset = true_subset.cuda() - - torchmetrics_xent.update(preds_subset, true_subset) - ce_with_keys_metric.update( - { - 'logits': preds_subset.view(-1, num_classes), - 'loss': cross_entropy(preds_subset.view(-1, num_classes), true_subset.view(-1)), - }, - true_subset.view(-1), - ) - - torchmetrics_loss = torchmetrics_xent.compute() - ce_with_keys_loss = ce_with_keys_metric.compute() - correct_loss = cross_entropy(generated_preds.view(-1, num_classes), generated_true.view(-1)) - assert torchmetrics_loss == ce_with_keys_loss - assert torch.isclose(correct_loss, torchmetrics_loss) - - -@pytest.mark.parametrize('ignore_index', [-100]) -@pytest.mark.parametrize('batch_size', [1e2, 1e3]) -@pytest.mark.parametrize('sequence_length', [128]) -@pytest.mark.parametrize('num_classes', [2, 10]) -@pytest.mark.parametrize('minibatch_size', [56, 256, 768]) -def test_torch_cpu_cross_entropy( batch_size: float, ignore_index: Optional[int], sequence_length: int, From 0d6ef2623f278685b7bff0831e1d46c95dbfb8c4 Mon Sep 17 00:00:00 2001 From: Saaketh Narayan Date: Mon, 17 Jun 2024 20:52:41 -0700 Subject: [PATCH 22/69] CPU tests image fix (#3409) * Revert "Optionally use `flash-attn`'s CE loss for metrics (#3394)" This reverts commit 2cf9262e988c7cc4ee107259b98efec0298c5017. 
revert dat boi * remove * slamm --- .github/workflows/pr-cpu.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index 1bdb383823..12f471749e 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -22,7 +22,7 @@ jobs: markers: not daily and not remote and not gpu and not doctest pytest_command: coverage run -m pytest - name: cpu-3.11-2.3 - container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04 markers: not daily and not remote and not gpu and not doctest pytest_command: coverage run -m pytest - name: cpu-doctest From dac19958fd2bfdc48149bd4d10be1a90d2c15fb4 Mon Sep 17 00:00:00 2001 From: Brian <23239305+b-chu@users.noreply.github.com> Date: Tue, 18 Jun 2024 13:29:34 -0400 Subject: [PATCH 23/69] Add setter for epoch in iteration (#3407) --- composer/core/time.py | 25 +++++++++++++++++-------- tests/test_time.py | 7 +++++++ 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/composer/core/time.py b/composer/core/time.py index 3916dd7659..00af1fd456 100644 --- a/composer/core/time.py +++ b/composer/core/time.py @@ -525,13 +525,8 @@ def __init__( raise ValueError(f'The `token` argument has units of {token.unit}; not {TimeUnit.TOKEN}.') self._token = token - epoch_in_iteration = Time.from_input(epoch_in_iteration, TimeUnit.EPOCH) - if epoch_in_iteration.unit != TimeUnit.EPOCH: - raise ValueError(( - f'The `epoch_in_iteration` argument has units of {epoch_in_iteration.unit}; ' - f'not {TimeUnit.EPOCH}.' - )) - self._epoch_in_iteration = epoch_in_iteration + self._epoch_in_iteration = Time(0, TimeUnit.EPOCH) + self.epoch_in_iteration = epoch_in_iteration token_in_iteration = Time.from_input(token_in_iteration, TimeUnit.TOKEN) if token_in_iteration.unit != TimeUnit.TOKEN: @@ -619,7 +614,7 @@ def load_state_dict(self, state: dict[str, Any]) -> None: if 'iteration' in state: self._iteration = Time(state['iteration'], TimeUnit.ITERATION) if 'epoch_in_iteration' in state: - self._epoch_in_iteration = Time(state['epoch_in_iteration'], TimeUnit.EPOCH) + self.epoch_in_iteration = Time(state['epoch_in_iteration'], TimeUnit.EPOCH) if 'token_in_iteration' in state: self._token_in_iteration = Time(state['token_in_iteration'], TimeUnit.TOKEN) if 'iteration_wct' in state: @@ -655,6 +650,20 @@ def epoch_in_iteration(self) -> Time[int]: """The epoch count in the current iteration (resets at 0 at the beginning of every iteration).""" return self._epoch_in_iteration + @epoch_in_iteration.setter + def epoch_in_iteration( + self, + epoch_in_iteration: Union[int, Time[int]], # pyright: ignore[reportPropertyTypeMismatch] + ): + """Sets epoch count in the current iteration.""" + epoch_in_iteration = Time.from_input(epoch_in_iteration, TimeUnit.EPOCH) + if epoch_in_iteration.unit != TimeUnit.EPOCH: + raise ValueError(( + f'The `epoch_in_iteration` argument has units of {epoch_in_iteration.unit}; ' + f'not {TimeUnit.EPOCH}.' 
+ )) + self._epoch_in_iteration = epoch_in_iteration + @property def token_in_iteration(self) -> Time[int]: """The token count in the current iteration (resets at 0 at the beginning of every iteration).""" diff --git a/tests/test_time.py b/tests/test_time.py index 1545eaa3b1..d585d9af36 100644 --- a/tests/test_time.py +++ b/tests/test_time.py @@ -146,6 +146,13 @@ def test_timestamp_update(): assert timestamp is not timestamp_2 +def test_set_timestamp(): + timestamp = Timestamp(epoch_in_iteration=1) + assert timestamp.epoch_in_iteration == 1 + timestamp.epoch_in_iteration = 2 + assert timestamp.epoch_in_iteration == 2 + + def test_timestamp_to_next_batch_epoch_iteration(): timestamp = Timestamp() # Step batch 0 in epoch 0 From 567c6e5065aef2b7a89cd7f7334b489ebf5c0c34 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 18 Jun 2024 13:02:34 -0700 Subject: [PATCH 24/69] Move pillow dep as required (#3412) * move pil dep * remove pillow simd --- docker/Dockerfile | 21 --------------------- setup.py | 16 +--------------- 2 files changed, 1 insertion(+), 36 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index e547b44c7b..970af2f1ef 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -25,13 +25,6 @@ ARG PYTORCH_VERSION=1.13.1 # version that corresponds to the PyTorch version ARG TORCHVISION_VERSION=0.14.1 -# In the Dockerimage, Pillow-SIMD is installed instead of Pillow. To trick pip into thinking that -# Pillow is also installed (so it won't override it with a future pip install), a Pillow stub is included -# PILLOW_PSEUDOVERSION is the Pillow version that pip thinks is installed -# PILLOW_SIMD_VERSION is the actual version of pillow-simd that is installed. -ARG PILLOW_PSEUDOVERSION=9.3.0 -ARG PILLOW_SIMD_VERSION=9.0.0.post1 - # Version of the Mellanox Drivers to install (for InfiniBand support) # Leave blank for no Mellanox Drivers ARG MOFED_VERSION=5.5-1.0.3.2 @@ -181,20 +174,6 @@ RUN add-apt-repository ppa:deadsnakes/ppa && \ RUN curl -fsSL https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} - && \ pip${PYTHON_VERSION} install --no-cache-dir --upgrade 'pip<23' 'setuptools<70.0.0' -##################### -# Install pillow-simd -##################### -ARG PILLOW_PSEUDOVERSION -ARG PILLOW_SIMD_VERSION - -# pillow_stub tricks pip into thinking that it installed pillow, -# so when pillow_simd is installed, other packages won't later override it -COPY pillow_stub /tmp/pillow_stub - -RUN pip${PYTHON_VERSION} install --no-cache-dir --upgrade /tmp/pillow_stub && \ - pip${PYTHON_VERSION} install --no-cache-dir --upgrade pillow_simd==${PILLOW_SIMD_VERSION} && \ - rm -rf /tmp/pillow_stub - ################# # Install Pytorch ################# diff --git a/setup.py b/setup.py index cbffa0b79c..207fe841c9 100644 --- a/setup.py +++ b/setup.py @@ -91,6 +91,7 @@ def package_files(prefix: str, directory: str, extension: str): 'packaging>=21.3.0,<24.2', 'importlib-metadata>=5.0.0,<7', 'mosaicml-cli>=0.5.25,<0.7', + 'pillow>=10.3.0,<11', ] extra_deps = {} @@ -142,7 +143,6 @@ def package_files(prefix: str, directory: str, extension: str): 'cryptography==42.0.8', 'pytest-httpserver>=1.0.4,<1.1', 'setuptools<=59.5.0', - 'pillow>=10.3.0,<11', ] extra_deps['system_metrics_monitor'] = { @@ -280,17 +280,3 @@ def package_files(prefix: str, directory: str, extension: str): ext_package='composer', cmdclass={'develop': develop}, ) - -# only visible if user installs with verbose -v flag -# Printing to stdout as not to interfere with setup.py CLI flags (e.g. 
--version) -print('*' * 20, file=sys.stderr) -print( - textwrap.dedent( - """\ - NOTE: For best performance, we recommend installing Pillow-SIMD - for accelerated image processing operations. To install: - \t pip uninstall pillow && pip install pillow-simd""", - ), - file=sys.stderr, -) -print('*' * 20, file=sys.stderr) From f26a1d32b67947f06fbcedfddddf9d46dbaf2d78 Mon Sep 17 00:00:00 2001 From: Jack Zhang <170473087+JackZ-db@users.noreply.github.com> Date: Tue, 18 Jun 2024 15:44:35 -0700 Subject: [PATCH 25/69] fixing mlflow logging to Databricks workspace file paths with /Shared/ prefix (#3410) * fixing os file path with /Shared/ prefix * lstrip '/' from experiment name if not '/Shared/' or '/Users/' Co-authored-by: Mihir Patel * doesnt modify experiment name if it has '/Shared/' as a prefix * fix formatting * lint --------- Co-authored-by: Mihir Patel --- composer/loggers/mlflow_logger.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/composer/loggers/mlflow_logger.py b/composer/loggers/mlflow_logger.py index 03070c28f9..aed32eea39 100644 --- a/composer/loggers/mlflow_logger.py +++ b/composer/loggers/mlflow_logger.py @@ -150,7 +150,12 @@ def __init__( ) assert self.experiment_name is not None # type hint - if os.getenv('DATABRICKS_TOKEN') is not None and not self.experiment_name.startswith('/Users/'): + if os.getenv( + 'DATABRICKS_TOKEN', + ) is not None and not self.experiment_name.startswith(( + '/Users/', + '/Shared/', + )): try: from databricks.sdk import WorkspaceClient except ImportError as e: @@ -160,7 +165,7 @@ def __init__( conda_channel='conda-forge', ) from e databricks_username = WorkspaceClient().current_user.me().user_name or '' - self.experiment_name = '/' + os.path.join('Users', databricks_username, self.experiment_name) + self.experiment_name = os.path.join('/Users', databricks_username, self.experiment_name.strip('/')) self._mlflow_client = MlflowClient(self.tracking_uri) # Set experiment From 894a1923393a36900ebab1b45b04ae41f07a5a39 Mon Sep 17 00:00:00 2001 From: Karan Jariwala Date: Tue, 18 Jun 2024 21:52:26 -0700 Subject: [PATCH 26/69] Bump version v0.23.3 (#3414) * Bump version v0.23.3 * update the composer version --- composer/_version.py | 2 +- docker/README.md | 4 ++-- docker/build_matrix.yaml | 12 ++++++------ docker/generate_build_matrix.py | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/composer/_version.py b/composer/_version.py index a38b61a722..1e088461f8 100644 --- a/composer/_version.py +++ b/composer/_version.py @@ -3,4 +3,4 @@ """The Composer Version.""" -__version__ = '0.24.0.dev0' +__version__ = '0.23.3' diff --git a/docker/README.md b/docker/README.md index 05c97fe626..e10af0a194 100644 --- a/docker/README.md +++ b/docker/README.md @@ -15,8 +15,8 @@ all dependencies for both NLP and Vision models. 
They are built on top of the | Composer Version | CUDA Support | Docker Tag | |--------------------|----------------|----------------------------------------------------------------| -| 0.23.2 | Yes | `mosaicml/composer:latest`, `mosaicml/composer:0.23.2` | -| 0.23.2 | No | `mosaicml/composer:latest_cpu`, `mosaicml/composer:0.23.2_cpu` | +| 0.23.3 | Yes | `mosaicml/composer:latest`, `mosaicml/composer:0.23.3` | +| 0.23.3 | No | `mosaicml/composer:latest_cpu`, `mosaicml/composer:0.23.3_cpu` | **Note**: For a lightweight installation, we recommended using a [MosaicML PyTorch Image](#pytorch-images) and manually diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml index 73074988b9..faa21b8e89 100644 --- a/docker/build_matrix.yaml +++ b/docker/build_matrix.yaml @@ -208,9 +208,9 @@ TORCHVISION_VERSION: 0.16.2 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04 - COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.2 + COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.3 CUDA_VERSION: 12.1.1 - IMAGE_NAME: composer-0-23-2 + IMAGE_NAME: composer-0-23-3 MOFED_VERSION: latest-23.10 NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 @@ -231,15 +231,15 @@ PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.3.1 TAGS: - - mosaicml/composer:0.23.2 + - mosaicml/composer:0.23.3 - mosaicml/composer:latest TARGET: composer_stage TORCHVISION_VERSION: 0.18.1 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: ubuntu:20.04 - COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.2 + COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.3 CUDA_VERSION: '' - IMAGE_NAME: composer-0-23-2-cpu + IMAGE_NAME: composer-0-23-3-cpu MOFED_VERSION: latest-23.10 NVIDIA_REQUIRE_CUDA_OVERRIDE: '' PYTHON_VERSION: '3.11' @@ -247,7 +247,7 @@ PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.3.1 TAGS: - - mosaicml/composer:0.23.2_cpu + - mosaicml/composer:0.23.3_cpu - mosaicml/composer:latest_cpu TARGET: composer_stage TORCHVISION_VERSION: 0.18.1 diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py index bf961a756c..9a634b0d36 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -231,7 +231,7 @@ def _main(): composer_entries = [] # The `GIT_COMMIT` is a placeholder and Jenkins will substitute it with the actual git commit for the `composer_staging` images - composer_versions = ['0.23.2'] # Only build images for the latest composer version + composer_versions = ['0.23.3'] # Only build images for the latest composer version composer_python_versions = [PRODUCTION_PYTHON_VERSION] # just build composer against the latest for product in itertools.product(composer_python_versions, composer_versions, cuda_options): From 459a0197ceece6df1d4cbf1de9576b23205cfdca Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 20 Jun 2024 10:20:00 -0700 Subject: [PATCH 27/69] Update numpy requirement from <1.27.0,>=1.21.5 to >=1.21.5,<2.1.0 (#3406) * Update numpy requirement from <1.27.0,>=1.21.5 to >=1.21.5,<2.1.0 Updates the requirements on [numpy](https://github.com/numpy/numpy) to permit the latest version. 
- [Release notes](https://github.com/numpy/numpy/releases) - [Changelog](https://github.com/numpy/numpy/blob/main/doc/RELEASE_WALKTHROUGH.rst) - [Commits](https://github.com/numpy/numpy/compare/v1.21.5...v2.0.0) --- updated-dependencies: - dependency-name: numpy dependency-type: direct:production ... Signed-off-by: dependabot[bot] * commit * fix typing --------- Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Mihir Patel Co-authored-by: Saaketh Narayan --- composer/algorithms/augmix/augmix.py | 4 ++-- setup.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/composer/algorithms/augmix/augmix.py b/composer/algorithms/augmix/augmix.py index f19ae36243..412fd737d8 100644 --- a/composer/algorithms/augmix/augmix.py +++ b/composer/algorithms/augmix/augmix.py @@ -96,8 +96,8 @@ def _augmix_pil_image( aug = np.random.choice(augmentation_set) augmented_image = aug(augmented_image, severity) augmented_combination += chain_weights[chain_i] * np.asarray(augmented_image) - mixed = (1 - mixing_weight) * np.asarray(img_pil) + mixing_weight * augmented_combination - mixed = Image.fromarray(np.uint8(mixed)) + mixed = (1 - mixing_weight) * np.asarray(img_pil, dtype=np.float32) + mixing_weight * augmented_combination + mixed = Image.fromarray(np.uint8(mixed)) # type: ignore return mixed f_pil = functools.partial( diff --git a/setup.py b/setup.py index 207fe841c9..29f7a8466b 100644 --- a/setup.py +++ b/setup.py @@ -83,7 +83,7 @@ def package_files(prefix: str, directory: str, extension: str): 'torchvision>=0.13.1,<0.18.2', 'torch>=2.1.2,<2.3.2', 'requests>=2.26.0,<3', - 'numpy>=1.21.5,<1.27.0', + 'numpy>=1.21.5,<2.1.0', 'psutil>=5.8.0,<6', 'coolname>=1.1.0,<3', 'tabulate==0.9.0', # for auto-generating tables From 7a4644acad747f68c430ab6ed56d9aa66cde6555 Mon Sep 17 00:00:00 2001 From: Karan Jariwala Date: Thu, 20 Jun 2024 10:23:19 -0700 Subject: [PATCH 28/69] Restore dev version (#3417) --- composer/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/_version.py b/composer/_version.py index 1e088461f8..a38b61a722 100644 --- a/composer/_version.py +++ b/composer/_version.py @@ -3,4 +3,4 @@ """The Composer Version.""" -__version__ = '0.23.3' +__version__ = '0.24.0.dev0' From 94f1ec16b5ffd665bac0271034cefe5545cf4e2d Mon Sep 17 00:00:00 2001 From: Evan Racah Date: Thu, 20 Jun 2024 17:54:21 -0700 Subject: [PATCH 29/69] Save checkpoint to disk for API with new save layout (#3399) --- composer/callbacks/checkpoint_saver.py | 3 +- composer/checkpoint/save.py | 284 ++++++++++++++++++++++++- composer/checkpoint/state_dict.py | 2 +- composer/utils/checkpoint.py | 1 + tests/checkpoint/helpers.py | 71 ++++++- tests/checkpoint/test_save.py | 151 ++++++++++++- tests/checkpoint/test_state_dict.py | 56 +---- 7 files changed, 507 insertions(+), 61 deletions(-) diff --git a/composer/callbacks/checkpoint_saver.py b/composer/callbacks/checkpoint_saver.py index 263558fc2b..c17b874c21 100644 --- a/composer/callbacks/checkpoint_saver.py +++ b/composer/callbacks/checkpoint_saver.py @@ -30,6 +30,7 @@ is_model_deepspeed, partial_format, ) +from composer.utils.checkpoint import _TORCH_DISTRIBUTED_CHECKPOINTS_METADATA_FILENAME from composer.utils.compression import get_compressor, is_compressed_pt from composer.utils.object_store.mlflow_object_store import MLFLOW_EXPERIMENT_ID_FORMAT_KEY, MLFLOW_RUN_ID_FORMAT_KEY @@ -37,8 +38,6 @@ __all__ = ['CheckpointSaver'] 
-_TORCH_DISTRIBUTED_CHECKPOINTS_METADATA_FILENAME = '.metadata' - class CheckpointSaver(Callback): # noqa: D101 __doc__ = f"""Callback to save checkpoints. diff --git a/composer/checkpoint/save.py b/composer/checkpoint/save.py index 72e5311d0f..03166d8802 100644 --- a/composer/checkpoint/save.py +++ b/composer/checkpoint/save.py @@ -3,12 +3,15 @@ """Useful functions for saving state dicts to disk.""" +import json import logging import os +import pickle import textwrap import warnings +from dataclasses import dataclass from pathlib import Path -from typing import Any, Dict, Optional, Union +from typing import Any, Dict, Optional, Sequence, Union import torch import torch.distributed.checkpoint as DCP @@ -16,6 +19,275 @@ from torch.distributed._shard.sharded_tensor import ShardedTensor from torch.distributed._tensor import DTensor +from composer.checkpoint.state_dict import ( + get_metadata_state_dict, + get_model_state_dict, + get_optim_state_dict, + get_resumption_state_dict, +) +from composer.core import State, Time +from composer.devices import Device +from composer.models import ComposerModel +from composer.utils import dist +from composer.utils.checkpoint import _TORCH_DISTRIBUTED_CHECKPOINTS_FILENAME, _write_checkpoint_file +from composer.utils.file_helpers import format_name_with_dist_and_time + +log = logging.getLogger(__name__) + +MODEL_CHECKPOINT_DIRECTORY_NAME = 'model' +MONOLITHIC_MODEL_CHECKPOINT_FILENAME = 'model.pt' +OPTIM_CHECKPOINT_DIRECTORY_NAME = 'optim' +OPTIM_MONO_CHECKPOINT_FILENAME = 'optim.pt' +METADATA_CHECKPOINT_FILENAME = 'composer_metadata.json' +RESUMPTION_CHECKPOINT_FILENAME = 'resumption.pkl' + + +@dataclass +class CheckpointSaveOptions: + """Options for saving a checkpoint to disk. + + Args: + destination_dir (str): The directory to save the checkpoint to. + save_frequency (Union[str, int, Time]): The frequency to save the checkpoint. + If '1ep', the checkpoint will be saved after each epoch. + If '1ba', the checkpoint will be saved after each batch. + If an int, the checkpoint will be saved after that many epochs. + dir_prefix (str): The prefix to use for the directory name. Can include {epoch} and {batch}. + overwrite (bool): Whether to overwrite the checkpoint if it already exists. + save_model (bool): Whether to save the model. + save_optimizer (bool): Whether to save the optimizer. + save_resumption_state (bool): Whether to save the resumption state. + num_checkpoints_to_keep (int): The number of checkpoints to keep. + If -1, all checkpoints will be kept. + save_format (str): The format to save the model in. 'pt', which is the standard pytorch serializarion, is the only option for now. + sharded_checkpoint (bool): Whether to save the model as a sharded checkpoint. + precision (str): The precision to save the model in. One of 'bf16', 'fp32', 'fp16', 'fp64'. + include_keys (Optional[Union[str, Sequence[str]]]): Keys to include in the saved model. + ignore_keys (Optional[Union[str, Sequence[str]]]): Keys to ignore in the saved model. 
+ """ + destination_dir: str + save_frequency: Union[str, int, Time] = '1ep' + dir_prefix: str = 'ep{epoch}-ba{batch}' + overwrite: bool = False + save_model: bool = True + save_optimizer: bool = True + save_resumption_state: bool = True + num_checkpoints_to_keep: int = -1 + save_format: str = 'pt' + sharded_checkpoint: bool = False + precision: str = 'bf16' + include_keys: Optional[Union[str, Sequence[str]]] = None + ignore_keys: Optional[Union[str, Sequence[str]]] = None + + +def save_checkpoint_to_disk( + state: State, + options: Optional[Union[CheckpointSaveOptions, Dict]] = None, + destination_dir: Optional[str] = None, +): + """Saves a checkpoint to disk. + + Args: + state (State): The state to save. + options (Optional[Union[CheckpointSaveOptions, Dict]]): The options for saving the checkpoint. + If None, destination_dir must be provided. + destination_dir (Optional[str]): The directory to save the checkpoint to. + If options is provided, this will overwrite options.destination_dir. + """ + if options is None: + if destination_dir is None: + raise ValueError('destination_dir must be provided if options is None') + options = CheckpointSaveOptions(destination_dir=destination_dir) + else: + if isinstance(options, Dict): + options = CheckpointSaveOptions(**options) + if destination_dir is not None: + options.destination_dir = destination_dir + save_path = os.path.join(options.destination_dir, options.dir_prefix) + save_path = format_name_with_dist_and_time(save_path, state.run_name, state.timestamp) + os.makedirs(save_path, exist_ok=True) + if options.save_model: + save_model_to_disk( + state.model, + save_path, + options.sharded_checkpoint, + options.precision, + options.include_keys, + options.ignore_keys, + options.overwrite, + options.save_format, + ) + if options.save_optimizer: + optimizer = state.optimizers[0] + save_optim_to_disk( + state.model, + optimizer, + save_path, + options.sharded_checkpoint, + options.precision, + options.overwrite, + options.save_format, + ) + if options.save_resumption_state: + save_resumption_state_to_disk(state, save_path) + + save_composer_metadata_to_disk( + save_path, + state.model, + options.sharded_checkpoint, + options.precision, + state.device, + state.device_train_microbatch_size, + ) + + +def save_model_to_disk( + model: Union[ComposerModel, torch.nn.Module], + destination_dir: str, + sharded_checkpoint: bool = False, + precision: str = 'fp32', + include_keys: Optional[Union[str, Sequence[str]]] = None, + ignore_keys: Optional[Union[str, Sequence[str]]] = None, + overwrite: bool = False, + save_format: str = 'pt', # or hf, safetensor +) -> Optional[str]: + """Saves a model to disk. + + Args: + model (Union[ComposerModel, torch.nn.Module]): The model to save. + destination_dir (str): The directory to save the model to. + Model will be saved as distination_dir/models/model.pt if sharded_checkpoint is False, + otherwise all shards will be saved as destination_dir/models/___0.distcp. + sharded_checkpoint (bool): Whether to save the model as a sharded checkpoint. + precision (str): The precision to save the model in. One of 'bf16', 'fp32', 'fp16', 'fp64'. + include_keys (Optional[Union[str, Sequence[str]]]): Keys to include in the saved model. + ignore_keys (Optional[Union[str, Sequence[str]]]): Keys to ignore in the saved model. + overwrite (bool): If True, the file will be overwritten if it exists. + save_format (str): The format to save the model in. One of 'pt', 'hf', or 'safetensor'. + + Returns: + str: The full path to the saved model. 
+ """ + if save_format != 'pt': + raise NotImplementedError( + f"Saving checkpoint in format {save_format} is not supported. Please choose from ['pt'].", + ) + model_state_dict = get_model_state_dict( + model, + sharded_checkpoint, + precision, + include_keys, + ignore_keys, + ) + + destination_file_path = ( + os.path.join(destination_dir, MODEL_CHECKPOINT_DIRECTORY_NAME) if sharded_checkpoint else + os.path.join(destination_dir, MODEL_CHECKPOINT_DIRECTORY_NAME, MONOLITHIC_MODEL_CHECKPOINT_FILENAME) + ) + saved_path = save_state_dict_to_disk( + state_dict=model_state_dict, + destination_file_path=destination_file_path, + overwrite=overwrite, + save_format=save_format, + ) + return saved_path + + +def save_optim_to_disk( + model: Union[ComposerModel, torch.nn.Module], + optimizer: torch.optim.Optimizer, + destination_dir: str, + sharded_checkpoint: bool = False, + precision: str = 'fp32', + overwrite: bool = False, + save_format: str = 'pt', +) -> Optional[str]: + """Saves an optimizer to disk. + + Args: + model (Union[ComposerModel, torch.nn.Module]): The model to save. + optimizer (torch.optim.Optimizer): The optimizer to save. + destination_dir (str): The directory to save the optimizer to. + Optimizer will be saved as destination_dir/optim/optim.pt if sharded_checkpoint is False, + otherwise all shards will be saved as destination_dir/optim/___0.distcp. + sharded_checkpoint (bool): Whether to save the optimizer as a sharded checkpoint. + precision (str): The precision to save the optimizer in. One of 'bf16', 'fp32', 'fp16', 'fp64'. + overwrite (bool): If True, the file will be overwritten if it exists. + save_format (str): The format to save the optimizer in. One of 'pt'. + """ + optim_state_dict = get_optim_state_dict( + model, + optimizer, + sharded_state_dict=sharded_checkpoint, + precision=precision, + ) + destination_file_path = os.path.join(destination_dir, + OPTIM_CHECKPOINT_DIRECTORY_NAME) if sharded_checkpoint else os.path.join( + destination_dir, + OPTIM_CHECKPOINT_DIRECTORY_NAME, + OPTIM_MONO_CHECKPOINT_FILENAME, + ) + saved_path = save_state_dict_to_disk( + state_dict=optim_state_dict, + destination_file_path=destination_file_path, + overwrite=overwrite, + save_format=save_format, + ) + + return saved_path + + +def save_composer_metadata_to_disk( + destination_dir: str, + model: Optional[Union[ComposerModel, torch.nn.Module]] = None, + sharded_state_dict: Optional[bool] = None, + precision: Optional[Union[str, torch.dtype]] = None, + device: Optional[Device] = None, + device_train_microbatch_size: Optional[Union[int, float]] = None, +): + """Saves metadata about the model to disk. + + Args: + destination_dir (str): The directory to save the metadata to. + model (Optional[Union[ComposerModel, torch.nn.Module]]): The model to save metadata about. + sharded_state_dict (Optional[bool]): Whether the model is sharded. + precision (Optional[Union[str, torch.dtype]]): The precision of the model. + device (Optional[Device]): The device the model is on. + device_train_microbatch_size (Optional[Union[int, float]]): The device train microbatch size. 
+ """ + md_dict = get_metadata_state_dict( + model, + sharded_state_dict, + precision, + device, + device_train_microbatch_size, + ) + os.makedirs(destination_dir, exist_ok=True) + destination_file_path = os.path.join(destination_dir, METADATA_CHECKPOINT_FILENAME) + + if dist.get_global_rank() == 0: + with open(destination_file_path, 'w') as f: + json.dump(md_dict, f, indent=4) + return destination_file_path + + +def save_resumption_state_to_disk( + state: State, + destination_dir: str, +): + """Saves the resumption state to disk. + + Args: + state (State): The state to save. + destination_dir (str): The directory to save the resumption state to. + """ + resumption_state_dict = get_resumption_state_dict(state) + destination_file_path = os.path.join(destination_dir, RESUMPTION_CHECKPOINT_FILENAME) + with open(destination_file_path, 'wb') as f: + pickle.dump(resumption_state_dict, f) + return destination_file_path + + from composer.utils import dist from composer.utils.checkpoint import _TORCH_DISTRIBUTED_CHECKPOINTS_FILENAME, _write_checkpoint_file @@ -80,6 +352,8 @@ def _save_sharded_state_dict_to_disk( ) destination_file_path = stripped_path + # Wait for all ranks to get here before checking if the directory exists. + dist.barrier() if dist.get_global_rank() == 0 and not overwrite and os.path.exists(destination_file_path): raise ValueError(f'Directory {destination_file_path} already exists. Set overwrite=True to overwrite it.') @@ -94,6 +368,9 @@ def _save_sharded_state_dict_to_disk( else: DCP.save(state_dict=state_dict, storage_writer=DCP.FileSystemWriter(destination_file_path)) + log.debug( + f'Finished saving of sharded state dict to {destination_file_path}/{_TORCH_DISTRIBUTED_CHECKPOINTS_FILENAME}', + ) return destination_file_path + '/' + _TORCH_DISTRIBUTED_CHECKPOINTS_FILENAME @@ -106,13 +383,14 @@ def _save_full_state_dict_to_disk( if save_format != 'pt': raise NotImplementedError( - f"Saving sharded state dict to disk in format {save_format} is not supported. Please choose from ['pt'].", + f"Saving full state dict to disk in format {save_format} is not supported. Please choose from ['pt'].", ) if not overwrite and os.path.exists(destination_file_path): raise ValueError(f'File {destination_file_path} already exists. Set overwrite=True to overwrite it.') if dist.get_global_rank() == 0: + os.makedirs(os.path.dirname(destination_file_path), exist_ok=True) _write_checkpoint_file(state_dict=state_dict, filename=destination_file_path) return destination_file_path return None @@ -130,7 +408,7 @@ def is_state_dict_sharded(state_dict: Dict[str, Any]) -> bool: for value in state_dict.values(): if isinstance(value, ShardedTensor) or isinstance(value, DTensor): return True - if isinstance(value, Dict): + elif isinstance(value, Dict): is_sharded = is_state_dict_sharded(value) if is_sharded: return True diff --git a/composer/checkpoint/state_dict.py b/composer/checkpoint/state_dict.py index a20baaf165..5f82836d7b 100644 --- a/composer/checkpoint/state_dict.py +++ b/composer/checkpoint/state_dict.py @@ -380,7 +380,7 @@ def get_metadata_state_dict( sharded_state_dict: Optional[bool] = None, precision: Optional[Union[str, torch.dtype]] = None, device: Optional[Device] = None, - device_train_microbatch_size: Optional[int] = None, + device_train_microbatch_size: Optional[Union[int, float]] = None, ) -> dict[str, Any]: """Generate the metadata and integrations for a training run. 
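The patch above introduces a standalone checkpoint-save API (save_checkpoint_to_disk plus CheckpointSaveOptions). A minimal, non-sharded usage sketch follows; it relies only on the functions added in this patch, while the tiny classifier, the random dataset, and the /tmp destination path are hypothetical placeholders rather than anything defined here. Resumption-state saving is left disabled because the toy dataset defines no state_dict() worth capturing; a sharded save would additionally require an FSDP-wrapped model in a multi-process run.

# Illustrative sketch only; assumes the composer.checkpoint.save module added in this patch.
import torch
from torch.utils.data import DataLoader, TensorDataset

from composer import Trainer
from composer.checkpoint.save import save_checkpoint_to_disk
from composer.models import ComposerClassifier

# Placeholder model and data (not part of the patch); any ComposerModel works.
module = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU(), torch.nn.Linear(8, 3))
model = ComposerClassifier(module=module, num_classes=3)
dataset = TensorDataset(torch.randn(32, 8), torch.randint(0, 3, (32,)))
train_dataloader = DataLoader(dataset, batch_size=4)

trainer = Trainer(model=model, train_dataloader=train_dataloader, max_duration='1ep')
trainer.fit()

# Non-sharded save: per the patch this writes ep{epoch}-ba{batch}/model/model.pt,
# ep{epoch}-ba{batch}/optim/optim.pt, and composer_metadata.json under destination_dir.
save_checkpoint_to_disk(
    trainer.state,
    {
        'destination_dir': '/tmp/composer-ckpt',  # hypothetical path
        'dir_prefix': 'ep{epoch}-ba{batch}',
        'sharded_checkpoint': False,
        'save_model': True,
        'save_optimizer': True,
        'save_resumption_state': False,
        'precision': 'fp32',
    },
)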
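Callers that only need the lower-level helper can drive save_state_dict_to_disk directly with a state dict from get_model_state_dict, which is what save_model_to_disk does internally. The sketch below assumes a single-process run, so is_state_dict_sharded() returns False and the full (single-file) path is taken; the module and file path are again illustrative placeholders.

# Illustrative sketch only; single-process, unsharded state dict.
import torch

from composer.checkpoint.save import save_state_dict_to_disk
from composer.checkpoint.state_dict import get_model_state_dict

module = torch.nn.Linear(8, 3)  # placeholder module, not from the patch
state_dict = get_model_state_dict(module, sharded_state_dict=False)

# With an unsharded state dict this takes the single-file path: parent directories
# are created and a regular .pt file is written on global rank 0 per the patched helper.
saved_path = save_state_dict_to_disk(
    state_dict=state_dict,
    destination_file_path='/tmp/composer-ckpt/model-only/model.pt',  # hypothetical path
    overwrite=True,
    save_format='pt',
)
# Returns the file path on global rank 0 and None on other ranks.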
diff --git a/composer/utils/checkpoint.py b/composer/utils/checkpoint.py index f2342eeb4c..f9ad516724 100644 --- a/composer/utils/checkpoint.py +++ b/composer/utils/checkpoint.py @@ -53,6 +53,7 @@ _COMPOSER_STATES_FILENAME = 'composer_states.pt' _DEEPSPEED_TAG = 'deepspeed' # always tag with the same, deterministic name. We'll rename the tarball to the appropriate name. _TORCH_DISTRIBUTED_CHECKPOINTS_FILENAME = f'__{dist.get_global_rank()}_0.distcp' +_TORCH_DISTRIBUTED_CHECKPOINTS_METADATA_FILENAME = '.metadata' def _get_checkpoint_validation_function( diff --git a/tests/checkpoint/helpers.py b/tests/checkpoint/helpers.py index 047d30e813..4915c3a150 100644 --- a/tests/checkpoint/helpers.py +++ b/tests/checkpoint/helpers.py @@ -1,24 +1,85 @@ # Copyright 2024 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Dict +from typing import Any, Dict, Tuple, Union +from unittest.mock import MagicMock import torch +from packaging import version from torch.distributed.fsdp import FullyShardedDataParallel as FSDP from torch.distributed.fsdp.api import CPUOffload from torch.optim import adam - +from torch.optim.lr_scheduler import StepLR +from torch.utils.data import DataLoader + +from composer.algorithms import SWA +from composer.callbacks import SpeedMonitor +from composer.core import State +from composer.devices import Device, DeviceCPU, DeviceGPU +from composer.models import ComposerModel from tests.common.models import EvenSimplerMLP, SimpleComposerMLP __all__ = [ 'init_model_and_optimizer', 'init_model', 'init_optimizer', + 'init_state', ] +def init_state( + use_fsdp: bool = False, + device: str = 'cpu', + include_schedulers=False, + include_callbacks=False, + include_algorithms=False, + use_grad_scaler=False, + rank_zero_seed=10, + run_name='test_run', + take_step=False, +) -> State: + model, optimizer = init_model_and_optimizer( + use_fsdp=use_fsdp, + use_composer_model=True, + take_step=take_step, + device=device, + ) + + test_dataset_sd = {'test': 0} + device_obj: Device = DeviceCPU() if device == 'cpu' else DeviceGPU() + + dataloader = MagicMock(spec=DataLoader) + dataloader.dataset = MagicMock() + dataloader.dataset.state_dict = MagicMock(return_value=test_dataset_sd) + kwargs = {} + + if include_callbacks: + kwargs['callbacks'] = [SpeedMonitor(), SpeedMonitor()] + if include_algorithms: + kwargs['algorithms'] = [SWA()] + if use_grad_scaler: + if version.parse(torch.__version__) >= version.parse('2.3.0'): + from torch.amp.grad_scaler import GradScaler + else: + from torch.cuda.amp.grad_scaler import GradScaler + kwargs['scaler'] = GradScaler() + + state = State( + model=model, + rank_zero_seed=rank_zero_seed, + run_name=run_name, + device=device_obj, + train_dataloader=dataloader, + optimizers=[optimizer], + **kwargs, + ) + if include_schedulers: + state.schedulers = StepLR(optimizer=optimizer, step_size=2) + return state + + def init_model_and_optimizer( - use_composer_model: bool, + use_composer_model: bool = True, num_classes=3, batch_size=5, num_features=8, @@ -26,7 +87,7 @@ def init_model_and_optimizer( use_fsdp=False, tensor_type='sharded_tensor', device='cuda', -): +) -> Tuple[Union[ComposerModel, torch.nn.Module], torch.optim.Optimizer]: model, loss_fn = init_model( use_composer_model, num_classes=num_classes, @@ -59,7 +120,7 @@ def init_model( tensor_type='sharded_tensor', sync_module_states=True, cpu_offload=False, -): +) -> Tuple[Union[ComposerModel, torch.nn.Module], Any]: if use_composer_model: model = 
SimpleComposerMLP(num_features=num_features, num_classes=num_classes, device=device) loss_fn = model._loss_fn diff --git a/tests/checkpoint/test_save.py b/tests/checkpoint/test_save.py index 03b12bbcbc..f4d41cc09d 100644 --- a/tests/checkpoint/test_save.py +++ b/tests/checkpoint/test_save.py @@ -1,6 +1,7 @@ # Copyright 2024 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 +import json import os import time import uuid @@ -12,15 +13,157 @@ import torch.distributed.checkpoint as DCP from packaging import version -from composer.checkpoint.save import save_state_dict_to_disk -from composer.checkpoint.state_dict import get_model_state_dict +from composer.checkpoint.save import ( + save_checkpoint_to_disk, + save_composer_metadata_to_disk, + save_model_to_disk, + save_optim_to_disk, + save_state_dict_to_disk, +) +from composer.checkpoint.state_dict import get_model_state_dict, get_optim_state_dict +from composer.core import Timestamp from composer.utils import dist -from composer.utils.checkpoint import _TORCH_DISTRIBUTED_CHECKPOINTS_FILENAME -from tests.checkpoint.helpers import init_model +from composer.utils.checkpoint import ( + _TORCH_DISTRIBUTED_CHECKPOINTS_FILENAME, + _TORCH_DISTRIBUTED_CHECKPOINTS_METADATA_FILENAME, +) +from tests.checkpoint.helpers import init_model, init_model_and_optimizer, init_state from tests.common.compare import deep_compare from tests.common.markers import world_size +@pytest.mark.gpu +@pytest.mark.parametrize( + 'world_size,sharded_model,sharded_checkpoint', + [ + pytest.param(1, False, False, marks=pytest.mark.world_size(1)), + pytest.param(2, True, True, marks=pytest.mark.world_size(2)), + pytest.param(2, True, False, marks=pytest.mark.world_size(2)), + ], +) +@pytest.mark.filterwarnings('ignore::UserWarning') +def test_save_checkpoint_to_disk(world_size: int, tmp_path: str, sharded_model: bool, sharded_checkpoint: bool): + destination_dir = os.path.join(tmp_path, str(uuid.uuid4())[:8]) + destination_dir = dist.all_gather_object(destination_dir)[0] + save_options = { + 'destination_dir': destination_dir, + 'save_model': True, + 'save_optimizer': True, + 'save_resumption_state': True, + 'sharded_checkpoint': sharded_checkpoint, + 'dir_prefix': 'ep{epoch}-ba{batch}', + } + state = init_state(use_fsdp=sharded_model, device='cuda', take_step=True) + state.run_name = 'foo' + state.timestamp = Timestamp() + expected_destination_dir = os.path.join(destination_dir, 'ep0-ba0') + save_checkpoint_to_disk(state, save_options) + expected_model_dir = os.path.join(expected_destination_dir, 'model') + expected_optim_dir = os.path.join(expected_destination_dir, 'optim') + expected_metadata_filepath = os.path.join(expected_destination_dir, 'composer_metadata.json') + expected_resumption_filepath = os.path.join(expected_destination_dir, 'resumption.pkl') + if sharded_checkpoint: + checkpoint_filenames = dist.all_gather_object(_TORCH_DISTRIBUTED_CHECKPOINTS_FILENAME) + for checkpoint_filename in checkpoint_filenames: + assert os.path.exists(os.path.join(expected_model_dir, checkpoint_filename)) + assert os.path.exists(os.path.join(expected_optim_dir, checkpoint_filename)) + assert os.path.exists(os.path.join(expected_model_dir, _TORCH_DISTRIBUTED_CHECKPOINTS_METADATA_FILENAME)) + assert os.path.exists(os.path.join(expected_optim_dir, _TORCH_DISTRIBUTED_CHECKPOINTS_METADATA_FILENAME)) + else: + assert os.path.exists(os.path.join(expected_model_dir, 'model.pt')) + assert os.path.exists(os.path.join(expected_optim_dir, 'optim.pt')) + + import time + + # Need to wait 
for the file to be written to avoid flaky test. + time.sleep(0.2) + assert os.path.exists(expected_metadata_filepath) + assert os.path.exists(expected_resumption_filepath) + + +def test_save_composer_metadata_to_disk(tmp_path: str): + destination_dir = os.path.join(tmp_path, str(uuid.uuid4())[:8]) + destination_dir = dist.all_gather_object(destination_dir)[0] + save_composer_metadata_to_disk(destination_dir) + expected_file_path = os.path.join(destination_dir, 'composer_metadata.json') + assert os.path.exists(expected_file_path) + json.load(open(expected_file_path, 'r')) + + +@pytest.mark.gpu +@pytest.mark.parametrize( + 'world_size,sharded_optimizer,sharded_checkpoint', + [ + pytest.param(1, False, False, marks=pytest.mark.world_size(1)), + pytest.param(2, True, True, marks=pytest.mark.world_size(2)), + pytest.param(2, True, False, marks=pytest.mark.world_size(2)), + ], +) +def test_save_optim_to_disk(world_size: int, tmp_path: str, sharded_optimizer: bool, sharded_checkpoint: bool): + destination_dir = os.path.join(tmp_path, str(uuid.uuid4())[:8]) + # Sync the path across all ranks + destination_dir = dist.all_gather_object(destination_dir)[0] + use_fsdp = sharded_optimizer + model, optim = init_model_and_optimizer(use_fsdp=use_fsdp, device='cuda') + optim_state_dict = get_optim_state_dict(model, optimizer=optim, sharded_state_dict=sharded_checkpoint) + optim_state_dict_saved = deepcopy(optim_state_dict) + save_optim_to_disk(model, optim, destination_dir=destination_dir, sharded_checkpoint=sharded_checkpoint) + + # Load new optim from disk + model, optim = init_model_and_optimizer(use_fsdp=use_fsdp, device='cuda') + cur_state_dict = get_optim_state_dict(model, optimizer=optim, sharded_state_dict=sharded_checkpoint) + + if sharded_checkpoint: + expected_file_path = os.path.join(destination_dir, 'optim') + if version.parse(torch.__version__) < version.parse('2.2.0'): + DCP.load_state_dict(state_dict=cur_state_dict, storage_reader=DCP.FileSystemReader(expected_file_path)) + else: + DCP.load(state_dict=cur_state_dict, storage_reader=DCP.FileSystemReader(expected_file_path)) + else: + if dist.get_global_rank() == 0: + expected_file_path = os.path.join(destination_dir, 'optim', 'optim.pt') + cur_state_dict = torch.load(expected_file_path, map_location='cuda') + + deep_compare(optim_state_dict_saved, cur_state_dict) + + +@pytest.mark.gpu +@pytest.mark.parametrize( + 'world_size,sharded_model,sharded_checkpoint', + [ + pytest.param(1, False, False, marks=pytest.mark.world_size(1)), + pytest.param(2, True, True, marks=pytest.mark.world_size(2)), + pytest.param(2, True, False, marks=pytest.mark.world_size(2)), + ], +) +def test_save_model_to_disk(world_size: int, tmp_path: str, sharded_model: bool, sharded_checkpoint: bool): + destination_dir = os.path.join(tmp_path, str(uuid.uuid4())[:8]) + # Sync the path across all ranks + destination_dir = dist.all_gather_object(destination_dir)[0] + use_fsdp = sharded_model + model, _ = init_model(use_fsdp=use_fsdp, device='cuda', sync_module_states=True) + state_dict = get_model_state_dict(model, sharded_state_dict=sharded_checkpoint) + state_dict_saved = deepcopy(state_dict) + save_model_to_disk(model, destination_dir=destination_dir, sharded_checkpoint=sharded_checkpoint) + + # Load new model from disk + new_model, _ = init_model(use_fsdp=use_fsdp, device='cuda', sync_module_states=True) + cur_state_dict = get_model_state_dict(new_model, sharded_state_dict=sharded_checkpoint) + + if sharded_checkpoint: + expected_file_path = os.path.join(destination_dir, 
'model') + if version.parse(torch.__version__) < version.parse('2.2.0'): + DCP.load_state_dict(state_dict=cur_state_dict, storage_reader=DCP.FileSystemReader(expected_file_path)) + else: + DCP.load(state_dict=cur_state_dict, storage_reader=DCP.FileSystemReader(expected_file_path)) + else: + if dist.get_global_rank() == 0: + expected_file_path = os.path.join(destination_dir, 'model', 'model.pt') + cur_state_dict = torch.load(expected_file_path, map_location='cuda') + + deep_compare(state_dict_saved, cur_state_dict) + + @world_size(1, 2) @pytest.mark.gpu @pytest.mark.parametrize('sharded_model', [False, True]) diff --git a/tests/checkpoint/test_state_dict.py b/tests/checkpoint/test_state_dict.py index 4f719254a7..12fde27249 100644 --- a/tests/checkpoint/test_state_dict.py +++ b/tests/checkpoint/test_state_dict.py @@ -3,27 +3,21 @@ import datetime from typing import Any -from unittest.mock import MagicMock import pytest import torch from packaging import version from torch.distributed.fsdp import FullyShardedDataParallel as FSDP -from torch.optim.lr_scheduler import StepLR -from torch.utils.data import DataLoader -from composer.algorithms import SWA -from composer.callbacks import SpeedMonitor from composer.checkpoint import ( get_metadata_state_dict, get_model_state_dict, get_optim_state_dict, get_resumption_state_dict, ) -from composer.core import State -from composer.devices import DeviceCPU, DeviceGPU +from composer.devices import DeviceGPU from composer.utils import dist, reproducibility -from tests.checkpoint.helpers import init_model_and_optimizer +from tests.checkpoint.helpers import init_model_and_optimizer, init_state from tests.common.compare import deep_compare from tests.common.markers import world_size from tests.common.models import EvenSimplerMLP, SimpleComposerMLP, configure_tiny_gpt2_hf_model @@ -444,27 +438,17 @@ def test_get_metadata_sharded_model(model_type: str, tensor_type: str, world_siz @pytest.mark.filterwarnings('ignore:SWA has') def test_get_resumption_state_dict(): - - model, optimizer = init_model_and_optimizer(use_composer_model=True, take_step=True, device='cpu') - - rank_zero_seed = 10 run_name = 'test_run' - device = DeviceCPU() - test_dataset_sd = {'foo': 0} - dataloader = MagicMock(spec=DataLoader) - dataloader.dataset = MagicMock() - dataloader.dataset.state_dict = MagicMock(return_value=test_dataset_sd) - swa = SWA() - state = State( - model=model, + rank_zero_seed = 10 + state = init_state( + device='cpu', + include_algorithms=True, + include_callbacks=True, + include_schedulers=True, rank_zero_seed=rank_zero_seed, run_name=run_name, - device=device, - train_dataloader=dataloader, - algorithms=[swa], - callbacks=[SpeedMonitor(), SpeedMonitor()], ) - state.schedulers = StepLR(optimizer=optimizer, step_size=2) + test_dataset_sd = {'test': 0} rsd = get_resumption_state_dict(state) assert rsd['rank_zero_seed'] == rank_zero_seed @@ -505,27 +489,7 @@ def test_get_resumption_state_dict(): @pytest.mark.gpu def test_get_resumption_state_dict_gpu(): - if version.parse(torch.__version__) >= version.parse('2.3.0'): - from torch.amp.grad_scaler import GradScaler - else: - from torch.cuda.amp.grad_scaler import GradScaler - - model, _ = init_model_and_optimizer(use_composer_model=True, take_step=False, device='cuda') - - rank_zero_seed = 10 - run_name = 'test_run' - device = DeviceCPU() - test_dataset_sd = {'test': 0} - dataloader = MagicMock() - dataloader.dataset = MagicMock() - dataloader.dataset.state_dict = MagicMock(return_value=test_dataset_sd) - state = State( 
- model=model, - rank_zero_seed=rank_zero_seed, - run_name=run_name, - device=device, - scaler=GradScaler(), - ) + state = init_state(device='cuda', use_grad_scaler=True) rsd = get_resumption_state_dict(state) assert 'scaler' in rsd assert set( From d420765ba09ecf6c2965a18e720447d594db4065 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Thu, 20 Jun 2024 21:04:09 -0700 Subject: [PATCH 30/69] fix typing (#3419) --- composer/trainer/_patch_pytorch.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/composer/trainer/_patch_pytorch.py b/composer/trainer/_patch_pytorch.py index 6771c5db4b..3f19df7d2a 100644 --- a/composer/trainer/_patch_pytorch.py +++ b/composer/trainer/_patch_pytorch.py @@ -933,7 +933,8 @@ def device_mesh__getitem__(self, mesh_dim_names: Union[str, tuple[str]]) -> 'Dev return submesh else: - from torch.distributed.device_mesh import _mesh_resources + from torch.utils._typing_utils import not_none + from torch.distributed.device_mesh import DeviceMesh, _mesh_resources def create_child_mesh( self, parent_mesh: 'DeviceMesh', submesh_dim_names: Tuple[str, ...], From ba1789789510bc0d6705b473b338998a2b22d324 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Thu, 20 Jun 2024 22:35:49 -0700 Subject: [PATCH 31/69] Fixes some typing issues (#3418) --- composer/callbacks/eval_output_logging_callback.py | 4 ++++ composer/core/evaluator.py | 10 +++++----- composer/loggers/mlflow_logger.py | 4 +++- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/composer/callbacks/eval_output_logging_callback.py b/composer/callbacks/eval_output_logging_callback.py index fd52b33960..717994413a 100644 --- a/composer/callbacks/eval_output_logging_callback.py +++ b/composer/callbacks/eval_output_logging_callback.py @@ -114,6 +114,10 @@ def eval_batch_end(self, state: State, logger: Logger) -> None: self.rows.extend(rows) def eval_end(self, state: State, logger: Logger) -> None: + # eval_batch_end will have set these if there is anything to log + if self.name is None or self.columns is None: + return + list_of_rows = dist.all_gather_object(self.rows) rows = [row for rows in list_of_rows for row in rows] for dest_logger in logger.destinations: diff --git a/composer/core/evaluator.py b/composer/core/evaluator.py index 767131bc35..d1ef6c947e 100644 --- a/composer/core/evaluator.py +++ b/composer/core/evaluator.py @@ -67,7 +67,7 @@ class Evaluator: When specifying ``eval_interval``, the evaluator(s) are also run at the ``Event.FIT_END`` if it doesn't evenly divide the training duration. - device_eval_microbatch_size (int, optional): The number of samples to use for each microbatch when evaluating. + device_eval_microbatch_size (str | int | float, optional): The number of samples to use for each microbatch when evaluating. If set to ``auto``, dynamically decreases device_eval_microbatch_size if microbatch is too large for GPU. If None, sets `device_eval_microbatch_size` to per rank batch size. 
(default: ``None``) """ @@ -80,7 +80,7 @@ def __init__( metric_names: Optional[list[str]] = None, subset_num_batches: Optional[int] = None, eval_interval: Optional[Union[int, str, Time, Callable[[State, Event], bool]]] = None, - device_eval_microbatch_size: Optional[Union[int, str]] = None, + device_eval_microbatch_size: Optional[Union[int, str, float]] = None, ): self.label = label self.dataloader = ensure_data_spec(dataloader) @@ -142,7 +142,7 @@ def ensure_evaluator(evaluator: Union[Evaluator, DataSpec, Iterable, dict[str, A ) -def _is_auto_microbatching(device_eval_microbatch_size: Optional[Union[int, str]]): +def _is_auto_microbatching(device_eval_microbatch_size: Optional[Union[int, str, float]]): if device_eval_microbatch_size == 'auto': warnings.warn(( "Setting `device_eval_microbatch_size='auto'` is an experimental feature which may cause " @@ -155,10 +155,10 @@ def _is_auto_microbatching(device_eval_microbatch_size: Optional[Union[int, str] def _get_initial_device_eval_microbatch_size( - device_eval_microbatch_size: Optional[Union[int, str]], + device_eval_microbatch_size: Optional[Union[int, str, float]], auto_microbatching: bool, dataloader: Iterable, -) -> int: +) -> Union[int, float]: """Sets initial value of device_eval_microbatch_size. If auto_microbatching, sets initial `device_eval_microbatch_size` to per rank batch size. diff --git a/composer/loggers/mlflow_logger.py b/composer/loggers/mlflow_logger.py index aed32eea39..526a7962fd 100644 --- a/composer/loggers/mlflow_logger.py +++ b/composer/loggers/mlflow_logger.py @@ -185,6 +185,9 @@ def __init__( def _start_mlflow_run(self, state): import mlflow + # This function is only called if self._enabled is True, and therefore self._experiment_id is not None. + assert self._experiment_id is not None + env_run_id = os.getenv( mlflow.environment_variables.MLFLOW_RUN_ID.name, # pyright: ignore[reportGeneralTypeIssues] None, @@ -193,7 +196,6 @@ def _start_mlflow_run(self, state): self._run_id = env_run_id elif self.resume: # Search for an existing run tagged with this Composer run if `self.resume=True`. - assert self._experiment_id is not None run_name = self.tags['run_name'] existing_runs = mlflow.search_runs( experiment_ids=[self._experiment_id], From 4e8ed2eaee03f2b2ed99a3158489f97c3e12a370 Mon Sep 17 00:00:00 2001 From: Brian <23239305+b-chu@users.noreply.github.com> Date: Fri, 21 Jun 2024 16:02:24 -0400 Subject: [PATCH 32/69] Fix small things (#3420) --- composer/core/state.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/composer/core/state.py b/composer/core/state.py index fa4feaec75..a1bb14f0af 100644 --- a/composer/core/state.py +++ b/composer/core/state.py @@ -759,7 +759,7 @@ def _iteration_length(self): def _iteration_length(self, iteration_length: Optional[Union[str, Time[int]]]): """Sets the length of an iteration. - An iteration must be defined as multiple epochs. See composer/core/event.py. + An iteration must be defined as multiple epochs or tokens. See composer/core/event.py. """ if iteration_length is None: self.__iteration_length = None @@ -777,7 +777,7 @@ def stop_training(self): logging, and evaluation for that batch, as well as any epoch end events. """ # Set the max_duration to the current time in its unit, except if the unit is TimeUnit.EPOCH. This is because TimeUnit.EPOCH is a very crude way to measure max duration. 
For example, it will result in division by zero error while computing get_elapsed_duration: https://github.com/mosaicml/composer/blob/1b9c6d3c0592183b947fd89890de0832366e33a7/composer/core/state.py#L641 - if self.max_duration is not None and Time.from_input(self.max_duration,).unit != TimeUnit.EPOCH: + if self.max_duration is not None and Time.from_input(self.max_duration).unit != TimeUnit.EPOCH: max_duration_unit = Time.from_input(self.max_duration).unit self.max_duration = self.timestamp.get(max_duration_unit) else: From 5ba56acf040fd676def4780fcd201e76649eb96d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 24 Jun 2024 07:52:48 -0700 Subject: [PATCH 33/69] Bump coverage[toml] from 7.5.3 to 7.5.4 (#3422) Bumps [coverage[toml]](https://github.com/nedbat/coveragepy) from 7.5.3 to 7.5.4. - [Release notes](https://github.com/nedbat/coveragepy/releases) - [Changelog](https://github.com/nedbat/coveragepy/blob/master/CHANGES.rst) - [Commits](https://github.com/nedbat/coveragepy/compare/7.5.3...7.5.4) --- updated-dependencies: - dependency-name: coverage[toml] dependency-type: direct:development update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 29f7a8466b..dc0be75cd3 100644 --- a/setup.py +++ b/setup.py @@ -103,7 +103,7 @@ def package_files(prefix: str, directory: str, extension: str): # Should manually update dependency versions occassionally. 'custom_inherit==2.4.1', 'junitparser==3.1.2', - 'coverage[toml]==7.5.3', + 'coverage[toml]==7.5.4', 'fasteners==0.18', # object store tests require fasteners 'pytest==7.4.4', 'ipython==8.11.0', From abfd78c3a380b92ff26d842c9f3ca0d72cefb5f7 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 24 Jun 2024 08:31:41 -0700 Subject: [PATCH 34/69] Update psutil requirement from <6,>=5.8.0 to >=5.8.0,<7 (#3424) Updates the requirements on [psutil](https://github.com/giampaolo/psutil) to permit the latest version. - [Changelog](https://github.com/giampaolo/psutil/blob/master/HISTORY.rst) - [Commits](https://github.com/giampaolo/psutil/compare/release-5.8.0...release-6.0.0) --- updated-dependencies: - dependency-name: psutil dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Mihir Patel --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index dc0be75cd3..0afe4ed314 100644 --- a/setup.py +++ b/setup.py @@ -84,7 +84,7 @@ def package_files(prefix: str, directory: str, extension: str): 'torch>=2.1.2,<2.3.2', 'requests>=2.26.0,<3', 'numpy>=1.21.5,<2.1.0', - 'psutil>=5.8.0,<6', + 'psutil>=5.8.0,<7', 'coolname>=1.1.0,<3', 'tabulate==0.9.0', # for auto-generating tables 'py-cpuinfo>=8.0.0,<10', From d3e95a92ac8fa37914ff67eac32ef43a48fdbc5f Mon Sep 17 00:00:00 2001 From: Joe Early Date: Mon, 24 Jun 2024 19:45:54 +0100 Subject: [PATCH 35/69] Add support for variable length dataloaders in DDP (#3416) * Add support for variable length dataloaders in dist training * Remove test file * Fix typo * Fixed batch referenced before assignment * Replace sentinel with None * Add unit test * Update unit test * Reduce tensor creation to one line Co-authored-by: Mihir Patel * Remove requirement for gpu in test --------- Co-authored-by: Mihir Patel --- composer/trainer/trainer.py | 13 ++++++++++++ tests/trainer/test_trainer.py | 37 +++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index 4447698beb..91dd0b1e19 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -3640,6 +3640,11 @@ def _iter_dataloader(self, trainer_mode: TrainerMode): else: dataloader_iter = itertools.islice(self.state.dataloader, int(self.state.dataloader_len)) + # Track if iteration has finished (used for distributed training when we have variable length dataloaders) + # 0 = not finished, 1 = finished (using integer tensors so we can use dist.all_reduce) + iter_finished = self.state.device.tensor_to_device(torch.zeros(1, dtype=torch.uint8)) + + batch = None while True: try: # [BEFORE/AFTER]_DATALOADER only runs while training @@ -3655,7 +3660,15 @@ def _iter_dataloader(self, trainer_mode: TrainerMode): # Otherwise, we will encounter an error at the start of the next epoch when # Event.BEFORE_DATALOADER tries to start an unfinished marker. 
self.engine.run_marker_only_event(Event.AFTER_DATALOADER) + # Mark iteration as finished - don't break yet as we need to sync across ranks + iter_finished += 1 + + # Sync iter finished across ranks + dist.all_reduce(iter_finished, reduce_operation='MAX') + # If any rank has finished, stop all rank iterations + if iter_finished.item() == 1: break + yield batch def _use_closures(self) -> bool: diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 59e8b26782..1bb5d265b6 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -1250,6 +1250,43 @@ def test_accumulate_time_across_ranks( assert num_tokens_accum == num_tokens * 2 assert batch_time_accum == datetime.timedelta(seconds=0.1 * (1 + 0)) + @pytest.mark.world_size(2) + def test_rank_dependent_dataloader_lengths( + self, + model: ComposerModel, + max_duration: Time[int], + ): + # Change rank 1 dataloader size to create different sized dataloaders on each rank + batch_size = 4 + orig_num_samples = 16 + rank_num_samples = orig_num_samples + 8 if dist.get_local_rank() == 1 else orig_num_samples + # Create train and eval dataloaders (will have rank-dependent lengths) + train_dataset = RandomClassificationDataset(size=rank_num_samples) + train_dataloader = DataLoader( + dataset=train_dataset, + batch_size=batch_size, + sampler=dist.get_sampler(train_dataset), + ) + eval_dataset = RandomClassificationDataset(size=rank_num_samples) + eval_dataloader = DataLoader( + dataset=eval_dataset, + batch_size=batch_size, + sampler=dist.get_sampler(eval_dataset), + ) + # Fit (train + eval) + trainer = Trainer( + model=model, + max_duration=max_duration, + train_dataloader=train_dataloader, + eval_dataloader=eval_dataloader, + ) + trainer.fit() + # Check the correct number of samples and batches have been processed + assert trainer.state.timestamp.sample.value == orig_num_samples + assert trainer.state.timestamp.batch.value == orig_num_samples / batch_size / 2 + assert trainer.state.eval_timestamp.sample.value == orig_num_samples + assert trainer.state.eval_timestamp.batch.value == orig_num_samples / batch_size / 2 + @world_size(1, 2) @device('cpu', 'gpu', 'gpu-amp', precision=True) From 84c4723108d3e378d20591a51adc82e84a13ccfc Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Mon, 24 Jun 2024 18:30:50 -0400 Subject: [PATCH 36/69] Hsdp + MoE CI tests (#3378) * fold ema fsdp state * debug * debug * more debug * keep debugging * debug * sanity check * debug * debug * use ema * debug * debug * debug * debug * debug * debug * more fix * filename test * revert test * fully parameterize * hsdp test * revert testing * typo * typo * hsdp * split off test * precommit * float to int * pyright * oom * print * rm tp * tp cfg * tp? 
* rm tp line * type annotation * revert * readd tp * type * world size * revert * revert monolithic cpkt + include sharded cpkt * enumerate * precommit * precommit * sharded * sync * only sync on first trainer * typo * hsdp * xfail * explicit sync * test * revert test * sync, docker issue * pre-commit * sync * pytest * xfail * rm world_size param * im so sorry pls forgive me king * the kings comments * Update tests/trainer/test_fsdp_checkpoint.py fix formatting Co-authored-by: Mihir Patel * precommit --------- Co-authored-by: v-chen_data Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> Co-authored-by: Mihir Patel --- tests/trainer/test_fsdp_checkpoint.py | 86 ++++++++++++++++++--------- 1 file changed, 59 insertions(+), 27 deletions(-) diff --git a/tests/trainer/test_fsdp_checkpoint.py b/tests/trainer/test_fsdp_checkpoint.py index bb99a9287e..2e5fd5d07b 100644 --- a/tests/trainer/test_fsdp_checkpoint.py +++ b/tests/trainer/test_fsdp_checkpoint.py @@ -289,21 +289,21 @@ def _compare_timestamps_between_state_dicts(state_dict1, state_dict2): @pytest.mark.gpu @pytest.mark.filterwarnings(r'ignore:.*scatter_full_optim_state_dict``is being deprecated.*:UserWarning') @pytest.mark.parametrize( - 'world_size,optimizer,autoresume,precision,save_weights_only,load_weights_only,load_monolith_rank0_only,use_tp', + 'optimizer,autoresume,precision,save_weights_only,load_weights_only,load_monolith_rank0_only,use_tp,use_hsdp', [ - pytest.param(2, 'adam', False, 'amp_bf16', False, False, False, False, marks=pytest.mark.world_size(2)), - pytest.param(2, 'adamw', False, 'amp_bf16', False, False, False, False, marks=pytest.mark.world_size(2)), - pytest.param(2, 'adam', True, 'amp_bf16', False, False, False, False, marks=pytest.mark.world_size(2)), - pytest.param(2, 'adam', False, 'amp_fp16', False, False, False, False, marks=pytest.mark.world_size(2)), - pytest.param(2, 'adam', False, 'amp_bf16', True, True, False, False, + pytest.param('adam', False, 'amp_bf16', False, False, False, False, False, marks=pytest.mark.world_size(2)), + pytest.param('adamw', False, 'amp_bf16', False, False, False, False, False, marks=pytest.mark.world_size(2)), + pytest.param('adam', True, 'amp_bf16', False, False, False, False, False, marks=pytest.mark.world_size(2)), + pytest.param('adam', False, 'amp_fp16', False, False, False, False, False, marks=pytest.mark.world_size(2)), + pytest.param('adam', False, 'amp_bf16', True, True, False, False, False, marks=pytest.mark.world_size(2)), # save_weights_only requires load_weights_only - pytest.param(2, 'adam', False, 'amp_bf16', False, True, False, False, marks=pytest.mark.world_size(2)), - pytest.param(2, 'adam', False, 'amp_bf16', False, False, True, False, marks=pytest.mark.world_size(2)), - pytest.param(4, 'adam', False, 'amp_bf16', False, False, False, True, marks=pytest.mark.world_size(4)), + pytest.param('adam', False, 'amp_bf16', False, True, False, False, False, marks=pytest.mark.world_size(2)), + pytest.param('adam', False, 'amp_bf16', False, False, True, False, False, marks=pytest.mark.world_size(2)), + pytest.param('adam', False, 'amp_bf16', False, False, False, True, False, marks=pytest.mark.world_size(4)), + pytest.param('adam', False, 'amp_bf16', False, False, False, False, True, marks=pytest.mark.world_size(4)), ], ) def test_fsdp_full_state_dict_load( - world_size, tmp_path: pathlib.Path, autoresume: bool, precision: str, @@ -312,7 +312,10 @@ def test_fsdp_full_state_dict_load( load_weights_only: bool, load_monolith_rank0_only: bool, use_tp: bool, 
+ use_hsdp: bool, ): + if use_hsdp: + pytest.xfail('Known Pytorch issue with HSDP, waiting for pytorch patch') if autoresume: run_name = 'my-cool-autoresume-run' else: @@ -320,11 +323,20 @@ def test_fsdp_full_state_dict_load( save_folder = tmp_path save_filename = 'rank{rank}.pt' - fsdp_config = FSDPConfig( - sharded_ckpt_prefix_dir='ba{batch}', - sync_module_states=load_monolith_rank0_only, - load_monolith_rank0_only=load_monolith_rank0_only, - ) + if use_hsdp: + fsdp_config = FSDPConfig( + sharding_strategy='HYBRID_SHARD', + sharded_ckpt_prefix_dir='ba{batch}', + data_parallel_shard_degree=2, + data_parallel_replicate_degree=2, + sync_module_states=True, + ) + else: + fsdp_config = FSDPConfig( + sharded_ckpt_prefix_dir='ba{batch}', + sync_module_states=load_monolith_rank0_only, + load_monolith_rank0_only=load_monolith_rank0_only, + ) tp_config = None if use_tp: from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel @@ -778,23 +790,33 @@ def mock_get_checkpoint_validation_function(): @pytest.mark.gpu @pytest.mark.parametrize('use_remote', [pytest.param(True, marks=pytest.mark.remote), False]) @pytest.mark.parametrize( - 'world_size,weights_only,optimizer,precision,autoresume,load_ignore_keys,use_symlink,use_tp', + 'weights_only,optimizer,precision,autoresume,load_ignore_keys,use_symlink,use_tp,use_hsdp', [ - pytest.param(2, False, 'adamw', 'amp_bf16', False, None, False, False, marks=pytest.mark.world_size(2)), - pytest.param(2, True, 'adamw', 'amp_bf16', False, None, False, False, marks=pytest.mark.world_size(2)), - pytest.param(2, False, 'adam', 'amp_bf16', False, None, False, False, marks=pytest.mark.world_size(2)), - pytest.param(2, False, 'adamw', 'amp_fp16', False, None, False, False, marks=pytest.mark.world_size(2)), - pytest.param(2, False, 'adamw', 'amp_bf16', True, None, False, False, marks=pytest.mark.world_size(2)), - pytest.param(2, False, 'adamw', 'amp_bf16', False, ['rng'], False, False, marks=pytest.mark.world_size(2)), - pytest.param(2, False, 'adamw', 'amp_bf16', False, None, True, False, marks=pytest.mark.world_size(2)), - pytest.param(2, False, 'adamw', 'amp_bf16', False, None, False, True, marks=pytest.mark.world_size(4)), + pytest.param(False, 'adamw', 'amp_bf16', False, None, False, False, False, marks=pytest.mark.world_size(2)), + pytest.param(True, 'adamw', 'amp_bf16', False, None, False, False, False, marks=pytest.mark.world_size(2)), + pytest.param(False, 'adam', 'amp_bf16', False, None, False, False, False, marks=pytest.mark.world_size(2)), + pytest.param(False, 'adamw', 'amp_fp16', False, None, False, False, False, marks=pytest.mark.world_size(2)), + pytest.param(False, 'adamw', 'amp_bf16', True, None, False, False, False, marks=pytest.mark.world_size(2)), + pytest.param( + False, + 'adamw', + 'amp_bf16', + False, + ['rng'], + False, + False, + False, + marks=pytest.mark.world_size(2), + ), + pytest.param(False, 'adamw', 'amp_bf16', False, None, True, False, False, marks=pytest.mark.world_size(2)), + pytest.param(False, 'adamw', 'amp_bf16', False, None, False, True, False, marks=pytest.mark.world_size(4)), + pytest.param(False, 'adamw', 'amp_bf16', False, None, False, False, True, marks=pytest.mark.world_size(4)), ], ) @pytest.mark.filterwarnings(r'ignore:TypedStorage is deprecated.:UserWarning') @pytest.mark.filterwarnings(r'ignore:.*metrics are not saved with sharded state dict.*:UserWarning') @pytest.mark.filterwarnings(r'ignore:Please use DTensor instead and we are deprecating ShardedTensor.:UserWarning') def 
test_fsdp_partitioned_state_dict_load( - world_size, tmp_path: pathlib.Path, autoresume: bool, precision: str, @@ -803,6 +825,7 @@ def test_fsdp_partitioned_state_dict_load( load_ignore_keys: Union[list[str], None], use_symlink: bool, use_tp: bool, + use_hsdp: bool, use_remote, s3_bucket, s3_ephemeral_prefix, @@ -829,10 +852,19 @@ def test_fsdp_partitioned_state_dict_load( save_filename = 'ba{batch}-rank{rank}.pt' - fsdp_config = FSDPConfig(state_dict_type='sharded', sharded_ckpt_prefix_dir='ba{batch}') + if use_hsdp: + fsdp_config = FSDPConfig( + sharding_strategy='HYBRID_SHARD', + sharded_ckpt_prefix_dir='ba{batch}', + state_dict_type='sharded', + data_parallel_shard_degree=2, + data_parallel_replicate_degree=2, + sync_module_states=True, + ) + else: + fsdp_config = FSDPConfig(state_dict_type='sharded', sharded_ckpt_prefix_dir='ba{batch}') tp_config = None if use_tp: - fsdp_config = FSDPConfig(state_dict_type='sharded', sharded_ckpt_prefix_dir='ba{batch}') from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel tp_config = { 'tensor_parallel_degree': 2, From 450130572ec77cb2327fbce96488ba4544dc79e3 Mon Sep 17 00:00:00 2001 From: Jack Zhang <170473087+JackZ-db@users.noreply.github.com> Date: Mon, 24 Jun 2024 17:57:08 -0700 Subject: [PATCH 37/69] bumping mlflow to 2.14.1 (#3425) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 0afe4ed314..a1deea27c7 100644 --- a/setup.py +++ b/setup.py @@ -223,7 +223,7 @@ def package_files(prefix: str, directory: str, extension: str): ] extra_deps['mlflow'] = [ - 'mlflow>=2.11.1,<3.0', + 'mlflow>=2.14.1,<3.0', 'databricks-sdk==0.28.0', 'pynvml>=11.5.0,<12', ] From a7218d151f691649b846aaa32c7cbcd1fd6d90c4 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 25 Jun 2024 16:42:37 -0400 Subject: [PATCH 38/69] Skip HSDP + TP pytests that require torch 2.3 or above (#3426) * test * skip if torch version less than 2.3 * typo in ema * add remote * comments --------- Co-authored-by: v-chen_data --- tests/trainer/test_fsdp_checkpoint.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/trainer/test_fsdp_checkpoint.py b/tests/trainer/test_fsdp_checkpoint.py index 2e5fd5d07b..3b4f26024c 100644 --- a/tests/trainer/test_fsdp_checkpoint.py +++ b/tests/trainer/test_fsdp_checkpoint.py @@ -316,6 +316,8 @@ def test_fsdp_full_state_dict_load( ): if use_hsdp: pytest.xfail('Known Pytorch issue with HSDP, waiting for pytorch patch') + if (use_tp or use_hsdp) and version.parse(torch.__version__) < version.parse('2.3.0'): + pytest.skip('HSDP and TP require torch 2.3.0 or later') if autoresume: run_name = 'my-cool-autoresume-run' else: @@ -833,8 +835,8 @@ def test_fsdp_partitioned_state_dict_load( ): if weights_only and autoresume: pytest.skip('Weights only with autoresume is not supported') - if use_tp and version.parse(torch.__version__) < version.parse('2.3.0'): - pytest.skip('TP requires torch 2.3.0 or later') + if (use_tp or use_hsdp) and version.parse(torch.__version__) < version.parse('2.3.0'): + pytest.skip('HSDP and TP require torch 2.3.0 or later') load_ignore_keys = [] if load_ignore_keys is None else load_ignore_keys From 83618629ec09b9d919c1350781831e8196377ad2 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Wed, 26 Jun 2024 14:38:04 -0700 Subject: [PATCH 39/69] remove codeql (#3429) --- .github/workflows/codeql-analysis.yml | 52 --------------------------- 1 file changed, 52 deletions(-) delete mode 100644 .github/workflows/codeql-analysis.yml diff --git 
a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml deleted file mode 100644 index 0cb835fbde..0000000000 --- a/.github/workflows/codeql-analysis.yml +++ /dev/null @@ -1,52 +0,0 @@ -# For most projects, this workflow file will not need changing; you simply need -# to commit it to your repository. -# -# You may wish to alter this file to override the set of languages analyzed, -# or to provide custom queries or build logic. -# -# ******** NOTE ******** -# We have attempted to detect the languages in your repository. Please check -# the `language` matrix defined below to confirm you have the correct set of -# supported CodeQL languages. -# -name: "CodeQL" - -on: - push: - branches: [dev, main] - pull_request: - # The branches below must be a subset of the branches above - branches: [dev, main] - schedule: - - cron: "0 9 * * 1" # Every Monday at 09:00 (9:00 AM) - -jobs: - analyze: - name: Analyze - runs-on: ubuntu-latest - permissions: - actions: read - contents: read - security-events: write - - strategy: - fail-fast: false - matrix: - language: ["python"] - # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', - # 'python', 'ruby' ] - # Learn more about CodeQL language support at - # https://git.io/codeql-language-support - - steps: - - name: Checkout repository - uses: actions/checkout@v2 - - name: Get composite run steps repository - uses: actions/checkout@v3 - with: - repository: mosaicml/ci-testing - ref: v0.0.8 - path: ./ci-testing - - uses: ./ci-testing/.github/actions/codeql-analysis - with: - language: ${{ matrix.language }} From 0b749339e3040362e6c9740a2e68276d41f0258d Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Thu, 27 Jun 2024 09:56:20 -0700 Subject: [PATCH 40/69] Remove save overwrite (#3431) * remove save overwrite * fix tests * lint * remove bad test --- composer/trainer/trainer.py | 5 ----- tests/trainer/test_checkpoint.py | 26 +++++++++++++++++--------- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index 91dd0b1e19..f5a6b57d77 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -1732,11 +1732,6 @@ def __init__( error_message = '' if save_folder is None: error_message += 'The `save_folder` must be specified when autoresume is enabled. ' - if save_overwrite: - error_message += textwrap.dedent( - 'The flag `save_overwrite` must be False when autoresume is enabled as autoresume always loads the ' - 'latest existing checkpoint in `save_folder`. ', - ) if save_latest_filename is None: error_message += 'The `save_latest_filename` must be specified so autoresume knows where to load checkpoints from. 
' if error_message != '': diff --git a/tests/trainer/test_checkpoint.py b/tests/trainer/test_checkpoint.py index d23b55875f..9912563eb8 100644 --- a/tests/trainer/test_checkpoint.py +++ b/tests/trainer/test_checkpoint.py @@ -730,11 +730,19 @@ def get_logger(self, tmp_path: pathlib.Path): @world_size(1, 2) @device('cpu', 'gpu') - @pytest.mark.parametrize('file_extension', ['.pt', '.tar.gz', '.pt.lz4']) @pytest.mark.parametrize('use_object_store', [True, False]) @pytest.mark.parametrize('delete_local', [True, False]) @pytest.mark.parametrize('test_slashed', [True, False]) - @pytest.mark.parametrize('save_metrics', [True, False]) + @pytest.mark.parametrize( + 'file_extension,save_metrics,save_overwrite', + [ + ['.pt', False, False], + ['.tar.gz', False, False], + ['.pt.lz4', False, False], + ['.pt', True, False], + ['.pt', False, True], + ], + ) def test_autoresume( self, device: str, @@ -744,6 +752,7 @@ def test_autoresume( delete_local: bool, test_slashed: bool, save_metrics: bool, + save_overwrite: bool, world_size: int, ): if delete_local and not use_object_store: @@ -786,6 +795,7 @@ def test_autoresume( autoresume=True, load_path='ignore_me.pt', # this should be ignored load_ignore_keys=['*'], # this should be ignored + save_overwrite=save_overwrite, loggers=[self.get_logger(tmp_path)] if use_object_store else [], ) @@ -1212,19 +1222,17 @@ def test_load_weights_object_store(self, tmp_path): ) @pytest.mark.parametrize( - 'run_name,save_folder,save_overwrite,latest_filename', + 'run_name,save_folder,latest_filename', [ - [None, 'first', False, 'latest-rank{rank}.pt'], - ['big-chungus', None, False, 'latest-rank{rank}.pt'], - ['big-chungus', 'first', True, 'latest-rank{rank}.pt'], - ['big-chungus', 'first', False, None], + [None, 'first', 'latest-rank{rank}.pt'], + ['big-chungus', None, 'latest-rank{rank}.pt'], + ['big-chungus', 'first', None], ], ) - def test_autoresume_fail(self, run_name, save_folder, save_overwrite, latest_filename): + def test_autoresume_fail(self, run_name, save_folder, latest_filename): with pytest.raises(ValueError): self.get_trainer( latest_filename=latest_filename, - save_overwrite=save_overwrite, save_folder=save_folder, run_name=run_name, autoresume=True, From dd3e7f904a2f5786559518863f7a7a47718cec28 Mon Sep 17 00:00:00 2001 From: Saaketh Narayan Date: Fri, 28 Jun 2024 11:24:35 -0700 Subject: [PATCH 41/69] LeDocs (#3430) --- docs/source/notes/distributed_training.rst | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/docs/source/notes/distributed_training.rst b/docs/source/notes/distributed_training.rst index 192167c935..9422e4280b 100644 --- a/docs/source/notes/distributed_training.rst +++ b/docs/source/notes/distributed_training.rst @@ -540,23 +540,24 @@ Composer integrates Pytorch's `Tensor Parallel `__. -This config is passed under `parallelism_config['tp']` to the Composer Trainer. An important parameters -which do not map include `tensor_parallel_degree`, which dictates the number of devices to shard across. +This config is passed under `parallelism_config['tp']` to the Composer Trainer. Important parameters +which do not directly map include `tensor_parallel_degree`, which dictates the number of devices to shard across, +and `layer_plan`, which simply corresponds to torch's `parallelize_plan`. -An example code snippet for using FSDP with composer is provided below: +An example code snippet for using TP and FSDP with Composer is provided below: .. 
code:: python @@ -624,10 +625,12 @@ An example code snippet for using FSDP with composer is provided below: } } - trainer = Trainer( model=composer_model, - parallelism_config={'fsdp': fsdp_config}, + parallelism_config={ + 'fsdp': fsdp_config, + 'tp': tp_config, + }, ... ) From ac4bd59d130c47238427467a0b79d724224a351d Mon Sep 17 00:00:00 2001 From: Chen Qian Date: Fri, 28 Jun 2024 15:06:12 -0700 Subject: [PATCH 42/69] Lower the system metrics logging frequency to reduce MLflow server's load (#3436) * lower the system metrics logging frequency * more frequent --- composer/loggers/mlflow_logger.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/composer/loggers/mlflow_logger.py b/composer/loggers/mlflow_logger.py index 526a7962fd..c90f167b82 100644 --- a/composer/loggers/mlflow_logger.py +++ b/composer/loggers/mlflow_logger.py @@ -123,6 +123,13 @@ def __init__( if logging_buffer_seconds: os.environ['MLFLOW_ASYNC_LOGGING_BUFFERING_SECONDS'] = str(logging_buffer_seconds) + if log_system_metrics: + # Set system metrics sampling interval and samples before logging so that system metrics + # are collected every 5s, and aggregated over 3 samples before being logged + # (logging per 15s). + mlflow.set_system_metrics_samples_before_logging(3) + mlflow.set_system_metrics_sampling_interval(5) + self._rank_zero_only = rank_zero_only self._last_flush_time = time.time() self._flush_interval = flush_interval From 38e5e515f7f569b833cf118c4f7bf17646bb9e5b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 1 Jul 2024 07:39:48 -0700 Subject: [PATCH 43/69] Update paramiko requirement from <3,>=2.11.0 to >=3.4.0,<4 (#3439) Updates the requirements on [paramiko](https://github.com/paramiko/paramiko) to permit the latest version. - [Commits](https://github.com/paramiko/paramiko/compare/2.11.0...3.4.0) --- updated-dependencies: - dependency-name: paramiko dependency-type: direct:development ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index a1deea27c7..768f6655ea 100644 --- a/setup.py +++ b/setup.py @@ -202,7 +202,7 @@ def package_files(prefix: str, directory: str, extension: str): extra_deps['streaming'] = [ 'mosaicml-streaming<1.0', 'boto3>=1.21.45,<2', - 'paramiko>=2.11.0,<3', + 'paramiko>=3.4.0,<4', ] extra_deps['libcloud'] = [ From 6b461d0333cece7a6e537def54076de8389dbb81 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Mon, 1 Jul 2024 08:17:14 -0700 Subject: [PATCH 44/69] bump versions (#3433) --- .github/workflows/code-quality.yaml | 2 +- .github/workflows/coverage.yaml | 2 +- .github/workflows/daily.yaml | 4 ++-- .github/workflows/pr-cpu.yaml | 2 +- .github/workflows/pr-gpu.yaml | 6 +++--- .github/workflows/release.yaml | 2 +- .github/workflows/smoketest.yaml | 2 +- 7 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/code-quality.yaml b/.github/workflows/code-quality.yaml index c35546f4ca..432e031cb4 100644 --- a/.github/workflows/code-quality.yaml +++ b/.github/workflows/code-quality.yaml @@ -34,7 +34,7 @@ jobs: uses: actions/checkout@v3 with: repository: mosaicml/ci-testing - ref: v0.0.8 + ref: v0.0.9 path: ./ci-testing - uses: ./ci-testing/.github/actions/code-quality with: diff --git a/.github/workflows/coverage.yaml b/.github/workflows/coverage.yaml index 9432e8c6c9..fc511d7e60 100644 --- a/.github/workflows/coverage.yaml +++ b/.github/workflows/coverage.yaml @@ -16,7 +16,7 @@ jobs: uses: actions/checkout@v3 with: repository: mosaicml/ci-testing - ref: v0.0.8 + ref: v0.0.9 path: ./ci-testing - uses: ./ci-testing/.github/actions/coverage with: diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml index 6b67e857ec..aa97c755c8 100644 --- a/.github/workflows/daily.yaml +++ b/.github/workflows/daily.yaml @@ -14,7 +14,7 @@ concurrency: cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }} jobs: daily-pytest-cpu: - uses: mosaicml/ci-testing/.github/workflows/pytest-cpu.yaml@v0.0.8 + uses: mosaicml/ci-testing/.github/workflows/pytest-cpu.yaml@v0.0.9 strategy: matrix: include: @@ -100,7 +100,7 @@ jobs: download-path: artifacts daily-pytest-gpu: - uses: mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.8 + uses: mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.9 strategy: matrix: # Unlike CPU tests, we run daily tests together with GPU tests to minimize launch time diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index 12f471749e..23129715db 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -9,7 +9,7 @@ concurrency: cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }} jobs: pytest-cpu: - uses: mosaicml/ci-testing/.github/workflows/pytest-cpu.yaml@v0.0.8 + uses: mosaicml/ci-testing/.github/workflows/pytest-cpu.yaml@v0.0.9 strategy: matrix: include: diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml index f056292a43..f6de8908c1 100644 --- a/.github/workflows/pr-gpu.yaml +++ b/.github/workflows/pr-gpu.yaml @@ -9,7 +9,7 @@ concurrency: cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }} jobs: pytest-gpu-1: - uses: mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.8 + uses: mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.9 
strategy: matrix: include: @@ -35,7 +35,7 @@ jobs: mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }} pytest-gpu-2: - uses: mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.8 + uses: mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.9 strategy: matrix: include: @@ -62,7 +62,7 @@ jobs: pytest-gpu-4: - uses: mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.8 + uses: mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.9 strategy: matrix: include: diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 0b253ea87f..c841e6c150 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -24,7 +24,7 @@ jobs: uses: actions/checkout@v3 with: repository: mosaicml/ci-testing - ref: v0.0.8 + ref: v0.0.9 path: ./ci-testing - uses: ./ci-testing/.github/actions/code-quality with: diff --git a/.github/workflows/smoketest.yaml b/.github/workflows/smoketest.yaml index e9c6316a8d..b7bb09aaab 100644 --- a/.github/workflows/smoketest.yaml +++ b/.github/workflows/smoketest.yaml @@ -33,7 +33,7 @@ jobs: uses: actions/checkout@v3 with: repository: mosaicml/ci-testing - ref: v0.0.8 + ref: v0.0.9 path: ./ci-testing - uses: ./ci-testing/.github/actions/smoketest with: From 6bac335bf95c848414688cd3013826e111463c2e Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Mon, 1 Jul 2024 09:47:33 -0700 Subject: [PATCH 45/69] fix eval after all (#3445) --- composer/core/event.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/composer/core/event.py b/composer/core/event.py index 1374a32c3f..e88d24109e 100644 --- a/composer/core/event.py +++ b/composer/core/event.py @@ -57,7 +57,7 @@ class Event(StringEnum): # - # + # for eval_dataloader in eval_dataloaders: if should_eval(batch=True): # @@ -70,7 +70,7 @@ class Event(StringEnum): # # - # + # # # From 3cd6e6de5f6506063d99f0945a59a8d3917fe1b1 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Mon, 1 Jul 2024 10:08:06 -0700 Subject: [PATCH 46/69] skip log (#3446) --- composer/loggers/mosaicml_logger.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/composer/loggers/mosaicml_logger.py b/composer/loggers/mosaicml_logger.py index cbbcd285c8..8b4ff5942a 100644 --- a/composer/loggers/mosaicml_logger.py +++ b/composer/loggers/mosaicml_logger.py @@ -146,7 +146,8 @@ def predict_end(self, state: State, logger: Logger) -> None: self._flush_metadata(force_flush=True) def close(self, state: State, logger: Logger) -> None: - self._flush_metadata(force_flush=True, future=False) + # Skip flushing metadata as it should be logged by fit/eval/predict_end. Flushing here + # might schedule futures while interpreter is shutting down, which will raise an error. 
if self._enabled: wait(self._futures) # Ignore raised errors on close From cf76c96d1462a801f31f41d82c56a47a84724999 Mon Sep 17 00:00:00 2001 From: Anna Date: Mon, 1 Jul 2024 11:38:13 -0700 Subject: [PATCH 47/69] Remove MosaicMLLambdaEvalClient (#3432) --- composer/utils/__init__.py | 3 +- composer/utils/eval_client/__init__.py | 2 - .../mosaicml_lambda_eval_client.py | 82 ------------------- .../eval_client/test_mcli_eval_client.py | 42 ---------- 4 files changed, 1 insertion(+), 128 deletions(-) delete mode 100644 composer/utils/eval_client/mosaicml_lambda_eval_client.py delete mode 100644 tests/utils/eval_client/test_mcli_eval_client.py diff --git a/composer/utils/__init__.py b/composer/utils/__init__.py index 9618d5f837..f04da5c0e8 100644 --- a/composer/utils/__init__.py +++ b/composer/utils/__init__.py @@ -30,7 +30,7 @@ is_compressed_pt, ) from composer.utils.device import get_device, is_hpu_installed, is_xla_installed -from composer.utils.eval_client import EvalClient, LambdaEvalClient, LocalEvalClient, MosaicMLLambdaEvalClient +from composer.utils.eval_client import EvalClient, LambdaEvalClient, LocalEvalClient from composer.utils.file_helpers import ( FORMAT_NAME_WITH_DIST_AND_TIME_TABLE, FORMAT_NAME_WITH_DIST_TABLE, @@ -140,7 +140,6 @@ 'EvalClient', 'LambdaEvalClient', 'LocalEvalClient', - 'MosaicMLLambdaEvalClient', 'partial_format', 'add_vision_dataset_transform', 'VersionedDeprecationWarning', diff --git a/composer/utils/eval_client/__init__.py b/composer/utils/eval_client/__init__.py index 95b780043a..98bcdd87dc 100644 --- a/composer/utils/eval_client/__init__.py +++ b/composer/utils/eval_client/__init__.py @@ -6,11 +6,9 @@ from composer.utils.eval_client.eval_client import EvalClient from composer.utils.eval_client.lambda_eval_client import LambdaEvalClient from composer.utils.eval_client.local_eval_client import LocalEvalClient -from composer.utils.eval_client.mosaicml_lambda_eval_client import MosaicMLLambdaEvalClient __all__ = [ 'EvalClient', 'LambdaEvalClient', 'LocalEvalClient', - 'MosaicMLLambdaEvalClient', ] diff --git a/composer/utils/eval_client/mosaicml_lambda_eval_client.py b/composer/utils/eval_client/mosaicml_lambda_eval_client.py deleted file mode 100644 index b0418bf86a..0000000000 --- a/composer/utils/eval_client/mosaicml_lambda_eval_client.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""MCLI compatible eval client.""" -import logging -import os -import time -from http import HTTPStatus - -import mcli -import numpy as np - -from composer.utils.eval_client.eval_client import EvalClient - -__all__ = ['MosaicMLLambdaEvalClient'] -log = logging.getLogger(__name__) - - -class MosaicMLLambdaEvalClient(EvalClient): - """Utility for creating a client for and invoking an AWS Lambda through MCLI.""" - - def __init__(self, backoff: int = 3, num_retries: int = 5) -> None: - """Checks that the requisite environment variables are in the EvalClient. - - `MOSAICML_ACCESS_TOKEN_ENV_VAR` environment variable must be set to access the platform. 
- """ - from composer.loggers.mosaicml_logger import \ - MOSAICML_ACCESS_TOKEN_ENV_VAR # in-line import to avoid circular import - - if MOSAICML_ACCESS_TOKEN_ENV_VAR not in os.environ: - raise RuntimeError('Cannot use MosaicML Lambda Client Eval without setting MOSAICML_ACCESS_TOKEN_ENV_VAR.') - log.debug('Running code eval through MosaicMLLambdaEvalClient.') - self.backoff = backoff - self.num_retries = num_retries - - def invoke(self, payload: list[list[list[dict[str, str]]]]) -> list[list[list[bool]]]: - """Invoke a batch of provided payloads for code evaluations.""" - num_beams = len(payload[0]) - num_tests = [len(generation_payload[0]) for generation_payload in payload] - cum_tests = (np.cumsum([0] + num_tests[:-1]) * num_beams).tolist() - test_cases = [ - test_case for generation_payload in payload for beam_payload in generation_payload - for test_case in beam_payload - ] - ret_helper = [False] * len(test_cases) - for i in range(self.num_retries): - try: - ret_helper = mcli.get_code_eval_output(test_cases).data # pyright: ignore[reportGeneralTypeIssues] - break - except mcli.MAPIException as e: - if e.status >= 500: - if i == self.num_retries - 1: - log.error(f'Failed to get code eval output after {self.num_retries} retries. Error: {e}') - log.warning(f'Failed to get code eval output, retrying in {self.backoff**i} seconds.') - time.sleep(self.backoff**i) - elif e.status == HTTPStatus.UNAUTHORIZED: - raise RuntimeError( - 'Failed to get code eval output due to UNAUTHORIZED error. ' - 'Please ensure you have access to MosaicMLLambdaEvalClient.', - ) from e - else: - log.error(f'Failed to get code eval output with unexpected MAPIException. Error: {e}') - break - except TimeoutError as e: - if i == self.num_retries - 1: - log.error(f'Failed to get code eval output after {self.num_retries} retries. Error: {e}') - log.warning(f'Failed to get code eval output, retrying in {self.backoff**i} seconds.') - time.sleep(self.backoff**i) - except Exception as e: - log.error(f'Failed to get code eval output with unexpected error. 
Error: {e}') - break - - ret = [] - for i in range(len(payload)): - ret_payload = [] - for j in range(num_beams): - ret_num_beams = [] - for k in range(num_tests[i]): - ret_num_beams.append(ret_helper[cum_tests[i] + j * num_tests[i] + k]) - ret_payload.append(ret_num_beams) - ret.append(ret_payload) - return ret diff --git a/tests/utils/eval_client/test_mcli_eval_client.py b/tests/utils/eval_client/test_mcli_eval_client.py deleted file mode 100644 index 56f13524a1..0000000000 --- a/tests/utils/eval_client/test_mcli_eval_client.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 -import pytest - -from composer.utils import MosaicMLLambdaEvalClient - - -@pytest.mark.remote -@pytest.mark.gpu # must run on MosaicML platform -@pytest.mark.parametrize( - 'code, result, language', - [ - ['def add_1(x):\n return x + 1', True, 'python'], - ['def add_1(x):\n return y + 1', False, 'python'], - ['def add_1(x):\n while True:\n x += 1', False, 'python'], - ['def add_1(x): return x + 2', False, 'python'], - ['int add_1(int x) {\n\treturn x + 1;\n}', True, 'c++'], - ['int add_1(int x) {\n\treturn y + 1;\n}', False, 'c++'], - ['int add_1(int x) {\n\twhile (true) {\n\t\tx += 1;\n\t}\n}', False, 'c++'], - ['int add_1(int x) {\n\treturn x + 2;\n}', False, 'c++'], - ['int add_1(int x) {\n\treturn x + 1;\n}', True, 'c'], - ['int add_1(int x) {\n\treturn y + 1;\n}', False, 'c'], - ['int add_1(int x) {\n\twhile (true) {\n\t\tx += 1;\n\t}\n}', False, 'c'], - ['int add_1(int x) {\n\treturn x + 2;\n}', False, 'c'], - ['function add_1(x) {\n\treturn x+1;\n}', True, 'javascript'], - ['function add_1(x) {\n\treturn y+1;\n}', False, 'javascript'], - ['function add_1(x) {\n\twhile (true) {\n\t\tx += 1;\n\t}\n}', False, 'javascript'], - ['function add_1(x) {\n\treturn x+2;\n}', False, 'javascript'], - ], -) -def test_mcli_invoke(code, result, language): - """Test invocation function for MosaicMLLambdaEvalClient with code that succeeds, fails compilation, times out, and is incorrect in C, C++, Python, JS. 
- """ - eval_client = MosaicMLLambdaEvalClient() - input = '(1,)' if language == 'python' else '1' - assert eval_client.invoke([[[{ - 'code': code, - 'input': input, - 'output': '2', - 'entry_point': 'add_1', - 'language': language, - }]]]) == [[[result]]] From 8fbca389a23a0686a7a7b63c9af4455d83c47a33 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Mon, 1 Jul 2024 13:22:20 -0700 Subject: [PATCH 48/69] Relax hf hub pin (#3435) --- pyproject.toml | 2 -- setup.py | 4 ++-- tests/test_full_nlp.py | 2 +- tests/utils/test_inference.py | 7 +++++-- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3b2469b935..8ca97bc494 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -160,8 +160,6 @@ filterwarnings = [ '''ignore:.*an autograd kernel was not registered to the Autograd key.*:UserWarning''', # Ignore save_state_dict / load_state_dict deprecation warnings '''ignore:'.*_state_dict' is deprecated and will be removed in future versions.*:UserWarning''', - # Ignore HF deprecation which affects their own libraries - '''ignore:'.*`resume_download` is deprecated and will be removed in version.*:FutureWarning''' ] # Coverage diff --git a/setup.py b/setup.py index 768f6655ea..4bee19e1cf 100644 --- a/setup.py +++ b/setup.py @@ -179,9 +179,9 @@ def package_files(prefix: str, directory: str, extension: str): ] extra_deps['nlp'] = [ - 'transformers>=4.11,!=4.34.0,<4.42', + 'transformers>=4.11,!=4.34.0,<4.43', 'datasets>=2.4,<3', - 'huggingface-hub>=0.21.2,<0.23', + 'huggingface-hub>=0.21.2,<0.24', ] extra_deps['peft'] = [ diff --git a/tests/test_full_nlp.py b/tests/test_full_nlp.py index 0ebb927c67..14380b38fe 100644 --- a/tests/test_full_nlp.py +++ b/tests/test_full_nlp.py @@ -237,7 +237,7 @@ def inference_test_helper( ('simpletransformer', [], 'torchscript'), ], ) -@pytest.mark.parametrize('onnx_opset_version', [13, None]) +@pytest.mark.parametrize('onnx_opset_version', [14, None]) def test_full_nlp_pipeline( model_type, algorithms, diff --git a/tests/utils/test_inference.py b/tests/utils/test_inference.py index f1c45b8562..e7c374377d 100644 --- a/tests/utils/test_inference.py +++ b/tests/utils/test_inference.py @@ -106,7 +106,7 @@ def test_export_for_inference_input_and_output_names(): @device('cpu', 'gpu') -@pytest.mark.parametrize('onnx_opset_version', [13, None]) +@pytest.mark.parametrize('onnx_opset_version', [14, None]) def test_huggingface_export_for_inference_onnx(onnx_opset_version, tiny_bert_config, device): pytest.importorskip('onnx') pytest.importorskip('onnxruntime') @@ -130,7 +130,10 @@ def test_huggingface_export_for_inference_onnx(onnx_opset_version, tiny_bert_con input_ids = torch.randint(low=0, high=30522, size=(2, 32)) labels = torch.randint(low=0, high=1, size=(2,)) token_type_ids = torch.zeros(size=(2, 32), dtype=torch.int64) - attention_mask = torch.randint(low=0, high=1, size=(2, 32)) + attention_mask = torch.ones(size=(2, 32), dtype=torch.int64) + # Mask some tokens + attention_mask[0, 2:] = 0 + sample_input = { 'input_ids': input_ids, 'labels': labels, From 54d58c962ed9a513a3cd64caf289cdfcd0bbd8c8 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Mon, 1 Jul 2024 17:34:31 -0700 Subject: [PATCH 49/69] Pytest skip 2 (#3448) * test * test * test * test * test * test * fix * sleep before skip * fix * pull request target * revert * revery pr_target branches * sleep 1 * 10 sec * uncomment * dist barrier * test * dist works! 
* update 0.0.9 * mihir comment Co-authored-by: Mihir Patel --------- Co-authored-by: v-chen_data Co-authored-by: Mihir Patel --- tests/trainer/test_fsdp_checkpoint.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/trainer/test_fsdp_checkpoint.py b/tests/trainer/test_fsdp_checkpoint.py index 3b4f26024c..154ed6b282 100644 --- a/tests/trainer/test_fsdp_checkpoint.py +++ b/tests/trainer/test_fsdp_checkpoint.py @@ -836,6 +836,7 @@ def test_fsdp_partitioned_state_dict_load( if weights_only and autoresume: pytest.skip('Weights only with autoresume is not supported') if (use_tp or use_hsdp) and version.parse(torch.__version__) < version.parse('2.3.0'): + dist.barrier() # Sync to avoid race conditions on cleaning up tmp_path pytest.skip('HSDP and TP require torch 2.3.0 or later') load_ignore_keys = [] if load_ignore_keys is None else load_ignore_keys From 5a129d1d279b3b20a6399d6593da471ac550e631 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Tue, 2 Jul 2024 07:56:56 -0600 Subject: [PATCH 50/69] bump version (#3450) --- composer/_version.py | 2 +- docker/README.md | 4 ++-- docker/build_matrix.yaml | 12 ++++++------ docker/generate_build_matrix.py | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/composer/_version.py b/composer/_version.py index a38b61a722..82928466f9 100644 --- a/composer/_version.py +++ b/composer/_version.py @@ -3,4 +3,4 @@ """The Composer Version.""" -__version__ = '0.24.0.dev0' +__version__ = '0.23.5' diff --git a/docker/README.md b/docker/README.md index e10af0a194..a0514ecb3d 100644 --- a/docker/README.md +++ b/docker/README.md @@ -15,8 +15,8 @@ all dependencies for both NLP and Vision models. They are built on top of the | Composer Version | CUDA Support | Docker Tag | |--------------------|----------------|----------------------------------------------------------------| -| 0.23.3 | Yes | `mosaicml/composer:latest`, `mosaicml/composer:0.23.3` | -| 0.23.3 | No | `mosaicml/composer:latest_cpu`, `mosaicml/composer:0.23.3_cpu` | +| 0.23.5 | Yes | `mosaicml/composer:latest`, `mosaicml/composer:0.23.5` | +| 0.23.5 | No | `mosaicml/composer:latest_cpu`, `mosaicml/composer:0.23.5_cpu` | **Note**: For a lightweight installation, we recommended using a [MosaicML PyTorch Image](#pytorch-images) and manually diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml index faa21b8e89..ee74d12309 100644 --- a/docker/build_matrix.yaml +++ b/docker/build_matrix.yaml @@ -208,9 +208,9 @@ TORCHVISION_VERSION: 0.16.2 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04 - COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.3 + COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.5 CUDA_VERSION: 12.1.1 - IMAGE_NAME: composer-0-23-3 + IMAGE_NAME: composer-0-23-5 MOFED_VERSION: latest-23.10 NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 @@ -231,15 +231,15 @@ PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.3.1 TAGS: - - mosaicml/composer:0.23.3 + - mosaicml/composer:0.23.5 - mosaicml/composer:latest TARGET: composer_stage TORCHVISION_VERSION: 0.18.1 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: ubuntu:20.04 - COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.3 + COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.5 CUDA_VERSION: '' - IMAGE_NAME: composer-0-23-3-cpu + IMAGE_NAME: composer-0-23-5-cpu MOFED_VERSION: latest-23.10 NVIDIA_REQUIRE_CUDA_OVERRIDE: '' PYTHON_VERSION: '3.11' @@ 
-247,7 +247,7 @@ PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.3.1 TAGS: - - mosaicml/composer:0.23.3_cpu + - mosaicml/composer:0.23.5_cpu - mosaicml/composer:latest_cpu TARGET: composer_stage TORCHVISION_VERSION: 0.18.1 diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py index 9a634b0d36..74d9c7fed4 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -231,7 +231,7 @@ def _main(): composer_entries = [] # The `GIT_COMMIT` is a placeholder and Jenkins will substitute it with the actual git commit for the `composer_staging` images - composer_versions = ['0.23.3'] # Only build images for the latest composer version + composer_versions = ['0.23.5'] # Only build images for the latest composer version composer_python_versions = [PRODUCTION_PYTHON_VERSION] # just build composer against the latest for product in itertools.product(composer_python_versions, composer_versions, cuda_options): From a0806f6bfa4320cbccdeafa2d934ee7516cb8981 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 8 Jul 2024 07:47:40 -0700 Subject: [PATCH 51/69] Bump ipykernel from 6.29.2 to 6.29.5 (#3459) Bumps [ipykernel](https://github.com/ipython/ipykernel) from 6.29.2 to 6.29.5. - [Release notes](https://github.com/ipython/ipykernel/releases) - [Changelog](https://github.com/ipython/ipykernel/blob/v6.29.5/CHANGELOG.md) - [Commits](https://github.com/ipython/ipykernel/compare/v6.29.2...v6.29.5) --- updated-dependencies: - dependency-name: ipykernel dependency-type: direct:development update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 4bee19e1cf..6508aad307 100644 --- a/setup.py +++ b/setup.py @@ -107,7 +107,7 @@ def package_files(prefix: str, directory: str, extension: str): 'fasteners==0.18', # object store tests require fasteners 'pytest==7.4.4', 'ipython==8.11.0', - 'ipykernel==6.29.2', + 'ipykernel==6.29.5', 'jupyter==1.0.0', 'yamllint==1.35.1', 'recommonmark==0.7.1', From 4b71141da41a7ea4c8e1737868ca0ec365a473bb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 8 Jul 2024 07:48:06 -0700 Subject: [PATCH 52/69] Update torchmetrics requirement (#3460) Updates the requirements on [torchmetrics](https://github.com/Lightning-AI/torchmetrics) to permit the latest version. - [Release notes](https://github.com/Lightning-AI/torchmetrics/releases) - [Changelog](https://github.com/Lightning-AI/torchmetrics/blob/master/CHANGELOG.md) - [Commits](https://github.com/Lightning-AI/torchmetrics/compare/v0.10.0...v1.4.0.post0) --- updated-dependencies: - dependency-name: torchmetrics dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6508aad307..b308678902 100644 --- a/setup.py +++ b/setup.py @@ -78,7 +78,7 @@ def package_files(prefix: str, directory: str, extension: str): install_requires = [ 'pyyaml>=6.0,<7', 'tqdm>=4.62.3,<5', - 'torchmetrics>=0.10.0,<1.3.3', + 'torchmetrics>=1.4.0.post0,<1.4.1', 'torch_optimizer>=0.3.0,<0.4', 'torchvision>=0.13.1,<0.18.2', 'torch>=2.1.2,<2.3.2', From 89db4e2591106c9b69dc7364185bcdad7ca7533e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 8 Jul 2024 07:49:04 -0700 Subject: [PATCH 53/69] Bump databricks-sdk from 0.28.0 to 0.29.0 (#3456) Bumps [databricks-sdk](https://github.com/databricks/databricks-sdk-py) from 0.28.0 to 0.29.0. - [Release notes](https://github.com/databricks/databricks-sdk-py/releases) - [Changelog](https://github.com/databricks/databricks-sdk-py/blob/main/CHANGELOG.md) - [Commits](https://github.com/databricks/databricks-sdk-py/compare/v0.28.0...v0.29.0) --- updated-dependencies: - dependency-name: databricks-sdk dependency-type: direct:development update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index b308678902..8f8498392d 100644 --- a/setup.py +++ b/setup.py @@ -224,13 +224,13 @@ def package_files(prefix: str, directory: str, extension: str): extra_deps['mlflow'] = [ 'mlflow>=2.14.1,<3.0', - 'databricks-sdk==0.28.0', + 'databricks-sdk==0.29.0', 'pynvml>=11.5.0,<12', ] extra_deps['pandas'] = ['pandas>=2.0.0,<3.0'] -extra_deps['databricks'] = ['databricks-sdk==0.28.0'] +extra_deps['databricks'] = ['databricks-sdk==0.29.0'] extra_deps['all'] = {dep for deps in extra_deps.values() for dep in deps} From 6df01ba6305f560bf507cacaa02b02e94a451487 Mon Sep 17 00:00:00 2001 From: bigning Date: Mon, 8 Jul 2024 09:48:38 -0700 Subject: [PATCH 54/69] [Checkpoint] Fix symlink issue where symlink file uploaded before checkpoint files upload (#3376) * a * a * a * a * a * a * a * a * fix test * a * a * a * a * fix unit test * a * a * a * a * a * fix 2gpu unit test * a * a * a * a * fix doctest * a * fix test and lint * up * a * a * a * a * a * a * a * a * address comments * a * a * a * a * rerun test * add logging * remove debug comments * comments * a * cleanup * a * linter * lint * Update composer/callbacks/checkpoint_saver.py Co-authored-by: Evan Racah * commenst * a * fix test * fix test * comments * a --------- Co-authored-by: Evan Racah --- composer/callbacks/checkpoint_saver.py | 179 +++++++++-- .../loggers/remote_uploader_downloader.py | 59 +--- composer/trainer/trainer.py | 23 +- composer/utils/__init__.py | 7 + composer/utils/file_helpers.py | 16 + composer/utils/object_store/__init__.py | 2 + composer/utils/object_store/utils.py | 48 +++ composer/utils/remote_uploader.py | 165 +++++++++- docs/source/doctest_fixtures.py | 25 +- .../test_remote_uploader_downloader.py | 16 +- tests/trainer/test_checkpoint.py | 283 ++++++++++-------- tests/utils/test_remote_uploader.py | 26 +- 12 files changed, 607 insertions(+), 242 deletions(-) create mode 100644 composer/utils/object_store/utils.py diff --git a/composer/callbacks/checkpoint_saver.py b/composer/callbacks/checkpoint_saver.py index 
c17b874c21..29468e66c3 100644 --- a/composer/callbacks/checkpoint_saver.py +++ b/composer/callbacks/checkpoint_saver.py @@ -20,6 +20,8 @@ FORMAT_NAME_WITH_DIST_AND_TIME_TABLE, FORMAT_NAME_WITH_DIST_TABLE, PartialFilePath, + RemoteFilesExistingCheckStatus, + RemoteUploader, checkpoint, create_interval_scheduler, create_symlink_file, @@ -28,6 +30,7 @@ format_name_with_dist, format_name_with_dist_and_time, is_model_deepspeed, + parse_uri, partial_format, ) from composer.utils.checkpoint import _TORCH_DISTRIBUTED_CHECKPOINTS_METADATA_FILENAME @@ -287,8 +290,13 @@ def __init__( num_checkpoints_to_keep: int = -1, weights_only: bool = False, ignore_keys: Optional[Union[list[str], Callable[[dict], None]]] = None, + num_concurrent_uploads: int = 1, + upload_timeout_in_seconds: int = 3600, ): - folder = str(folder) + backend, _, local_folder = parse_uri(str(folder)) + if local_folder == '': + local_folder = '.' + filename = str(filename) remote_file_name = str(remote_file_name) if remote_file_name is not None else None latest_filename = str(latest_filename) if latest_filename is not None else None @@ -304,10 +312,10 @@ def __init__( self.save_interval = save_interval self.last_checkpoint_batch: Optional[Time] = None - self.folder = folder + self.folder = local_folder - self.filename = PartialFilePath(filename.lstrip('/'), folder) - self.latest_filename = PartialFilePath(latest_filename.lstrip('/'), folder) if latest_filename else None + self.filename = PartialFilePath(filename.lstrip('/'), local_folder) + self.latest_filename = PartialFilePath(latest_filename.lstrip('/'), local_folder) if latest_filename else None self.remote_file_name = PartialFilePath(remote_file_name) if remote_file_name else None self.latest_remote_file_name = PartialFilePath(latest_remote_file_name) if latest_remote_file_name else None @@ -320,6 +328,23 @@ def __init__( self.start_batch = None + self.remote_uploader = None + self.rank_saves_symlinks: bool = False + self.tmp_dir_for_symlink = tempfile.TemporaryDirectory() + self.num_concurrent_uploads = num_concurrent_uploads + self.upload_timeout_in_seconds = upload_timeout_in_seconds + # Allow unit test to override this to make it faster + self._symlink_upload_wait_before_next_try_in_seconds = 30.0 + self.pid = os.getpid() + self.symlink_count = 0 + self.symlink_upload_tasks = [] + + if backend != '': + self.remote_uploader = RemoteUploader( + remote_folder=str(folder), + num_concurrent_uploads=self.num_concurrent_uploads, + ) + def init(self, state: State, logger: Logger) -> None: # If MLFlowLogger is being used, format MLFlow-specific placeholders in the save folder and paths. # Assumes that MLFlowLogger comes before CheckpointSaver in the list of loggers. 
@@ -346,9 +371,10 @@ def init(self, state: State, logger: Logger) -> None: self.latest_remote_file_name.filename, **mlflow_format_kwargs, ) - break + if self.remote_uploader is not None: + self.remote_uploader.init() folder = format_name_with_dist(self.folder, state.run_name) os.makedirs(folder, exist_ok=True) @@ -410,6 +436,27 @@ def load_state_dict(self, state: dict[str, Any]): load_timestamp.load_state_dict(timestamp_state) self.all_saved_checkpoints_to_timestamp[save_filename] = load_timestamp + def _upload_checkpoint( + self, + remote_file_name: str, + local_file_name: str, + local_remote_file_names: list[str], + logger: Logger, + ): + if self.remote_uploader is not None: + self.remote_uploader.upload_file_async( + remote_file_name=remote_file_name, + file_path=pathlib.Path(local_file_name), + overwrite=self.overwrite, + ) + local_remote_file_names.append(remote_file_name) + else: + logger.upload_file( + remote_file_name=remote_file_name, + file_path=local_file_name, + overwrite=self.overwrite, + ) + def _save_checkpoint(self, state: State, logger: Logger): self.last_checkpoint_batch = state.timestamp.batch @@ -432,7 +479,14 @@ def _save_checkpoint(self, state: State, logger: Logger): ) log.debug(f'Checkpoint locally saved to {saved_path}') + self.symlink_count += 1 + # Remote checkpoint file names on this rank + local_remote_file_names = [] + all_remote_filenames = [] + if not saved_path: # not all ranks save + if self.remote_file_name is not None and self.remote_uploader is not None: + all_remote_filenames = dist.all_gather_object(local_remote_file_names) return metadata_local_file_path = None @@ -443,6 +497,7 @@ def _save_checkpoint(self, state: State, logger: Logger): state.timestamp, ) + self.rank_saves_symlinks = dist.get_global_rank() == 0 or not state.fsdp_sharded_state_dict_enabled if self.latest_filename is not None and self.num_checkpoints_to_keep != 0: symlink = self.latest_filename.format(state, is_deepspeed) os.makedirs(os.path.dirname(symlink), exist_ok=True) @@ -455,8 +510,7 @@ def _save_checkpoint(self, state: State, logger: Logger): src_path = str(pathlib.Path(saved_path).parent) else: src_path = saved_path - this_rank_saves_symlinks = dist.get_global_rank() == 0 or not state.fsdp_sharded_state_dict_enabled - if this_rank_saves_symlinks: + if self.rank_saves_symlinks: os.symlink(os.path.relpath(src_path, os.path.dirname(symlink)), symlink) # if remote file name provided, upload the checkpoint @@ -482,10 +536,11 @@ def _save_checkpoint(self, state: State, logger: Logger): state.timestamp, ) assert metadata_local_file_path is not None - logger.upload_file( + self._upload_checkpoint( remote_file_name=metadata_remote_file_name, - file_path=metadata_local_file_path, - overwrite=self.overwrite, + local_file_name=metadata_local_file_path, + local_remote_file_names=local_remote_file_names, + logger=logger, ) else: remote_file_name = self.remote_file_name.format( @@ -495,12 +550,20 @@ def _save_checkpoint(self, state: State, logger: Logger): log.debug(f'Uploading checkpoint to {remote_file_name}') try: - logger.upload_file(remote_file_name=remote_file_name, file_path=saved_path, overwrite=self.overwrite) + self._upload_checkpoint( + remote_file_name=remote_file_name, + local_file_name=saved_path, + local_remote_file_names=local_remote_file_names, + logger=logger, + ) except FileExistsError as e: raise FileExistsError( f'Uploading checkpoint failed with error: {e}. overwrite was set to {self.overwrite}. 
To overwrite checkpoints with Trainer, set save_overwrite to True.', ) from e + if self.remote_uploader is not None: + all_remote_filenames = dist.all_gather_object(local_remote_file_names) + # symlinks stay the same with sharded checkpointing if self.latest_remote_file_name is not None: symlink_name = self.latest_remote_file_name.format( @@ -509,17 +572,31 @@ def _save_checkpoint(self, state: State, logger: Logger): ).lstrip('/') + '.symlink' # create and upload a symlink file - with tempfile.TemporaryDirectory() as tmpdir: - symlink_filename = os.path.join(tmpdir, 'latest.symlink') - # Sharded checkpoints for torch >2.0 use directories not files for load_paths - if state.fsdp_sharded_state_dict_enabled: - src_path = str(pathlib.Path(remote_file_name).parent) + symlink_filename = os.path.join( + self.tmp_dir_for_symlink.name, + f'latest.{self.symlink_count}.symlink', + ) + # Sharded checkpoints for torch >2.0 use directories not files for load_paths + if state.fsdp_sharded_state_dict_enabled: + src_path = str(pathlib.Path(remote_file_name).parent) + else: + src_path = remote_file_name + log.debug(f'Creating symlink file {symlink_filename} -> {src_path}') + if self.rank_saves_symlinks: + create_symlink_file(src_path, symlink_filename) + if self.remote_uploader is not None: + remote_checkpoint_file_names = [] + for file_names in all_remote_filenames: + remote_checkpoint_file_names += file_names + check_remote_files_exist_future = self.remote_uploader.check_remote_files_exist_async( + remote_checkpoint_file_names=remote_checkpoint_file_names, + max_wait_time_in_seconds=self.upload_timeout_in_seconds, + wait_before_next_try_in_seconds=self._symlink_upload_wait_before_next_try_in_seconds, + ) + self.symlink_upload_tasks.append( + (check_remote_files_exist_future, symlink_filename, symlink_name), + ) else: - src_path = remote_file_name - log.debug(f'Creating symlink file {symlink_filename} -> {src_path}') - this_rank_saves_symlinks = dist.get_global_rank() == 0 or not state.fsdp_sharded_state_dict_enabled - if this_rank_saves_symlinks: - create_symlink_file(src_path, symlink_filename) logger.upload_file( remote_file_name=symlink_name, file_path=symlink_filename, @@ -532,7 +609,6 @@ def _save_checkpoint(self, state: State, logger: Logger): self._rotate_checkpoints(sharding_enabled=state.fsdp_sharded_state_dict_enabled) def _rotate_checkpoints(self, sharding_enabled: bool = False): - while len(self.saved_checkpoints) > self.num_checkpoints_to_keep: prefix_dir = None checkpoint_to_delete = self.saved_checkpoints.pop(0) @@ -542,3 +618,62 @@ def _rotate_checkpoints(self, sharding_enabled: bool = False): else: if dist.get_global_rank() == 0: shutil.rmtree(prefix_dir) + + def batch_end(self, state: State, logger: Logger) -> None: + del state, logger # unused + if self.remote_uploader is None: + return + self.remote_uploader.check_workers() + if not self.rank_saves_symlinks: + return + undone_symlink_upload_tasks = [] + for (check_remote_files_exist_future, local_symlink_file, + remote_symlink_file) in reversed(self.symlink_upload_tasks): + if not check_remote_files_exist_future.done(): + undone_symlink_upload_tasks.insert( + 0, + (check_remote_files_exist_future, local_symlink_file, remote_symlink_file), + ) + continue + if check_remote_files_exist_future.done(): + result = check_remote_files_exist_future.result() + if result == RemoteFilesExistingCheckStatus.EXIST: + self.remote_uploader.upload_file_async( + remote_file_name=remote_symlink_file, + file_path=local_symlink_file, + overwrite=True, + ) + 
break + else: + raise RuntimeError(f'Failed to check if checkpoint files upload finish: {result}') + self.symlink_upload_tasks = undone_symlink_upload_tasks + + def fit_end(self, state: State, logger: Logger) -> None: + del state, logger # unused + if self.remote_uploader is None: + return + log.info('Waiting for checkpoint uploading to finish') + self.remote_uploader.wait() + if self.rank_saves_symlinks and len(self.symlink_upload_tasks) > 0: + log.debug('Uploading symlink to the latest checkpoint') + # We only need to upload a symlink pointing to the latest checkpoint files, so we can ignore successful uploads of older checkpoints. + check_remote_files_exist_future, local_symlink_file, remote_symlink_file = self.symlink_upload_tasks[-1] + result = check_remote_files_exist_future.result() + if result == RemoteFilesExistingCheckStatus.EXIST: + symlink_upload_future = self.remote_uploader.upload_file_async( + remote_file_name=remote_symlink_file, + file_path=local_symlink_file, + overwrite=True, + ) + symlink_upload_future.result() + else: + raise RuntimeError(f'Failed to check if checkpoint files upload finish: {result}') + log.info('Checkpoint uploading finished!') + + def post_close(self): + if self.remote_uploader is not None: + # Wait the symlink file upload to finish and close remote uploader + try: + self.remote_uploader.wait_and_close() + except Exception as e: + log.error(f'RemoteUploader run into exception {e}') diff --git a/composer/loggers/remote_uploader_downloader.py b/composer/loggers/remote_uploader_downloader.py index 981cc4c650..9378d5a8d4 100644 --- a/composer/loggers/remote_uploader_downloader.py +++ b/composer/loggers/remote_uploader_downloader.py @@ -25,19 +25,15 @@ from composer.loggers import Logger, MosaicMLLogger from composer.loggers.logger_destination import LoggerDestination from composer.utils import ( - GCSObjectStore, - LibcloudObjectStore, MLFlowObjectStore, ObjectStore, ObjectStoreTransientError, - OCIObjectStore, - S3ObjectStore, - SFTPObjectStore, - UCObjectStore, + build_remote_backend, dist, format_name_with_dist, get_file, retry, + validate_credentials, ) from composer.utils.object_store.mlflow_object_store import MLFLOW_DBFS_PATH_PREFIX @@ -50,37 +46,6 @@ __all__ = ['RemoteUploaderDownloader'] -def _build_remote_backend(remote_backend_name: str, backend_kwargs: dict[str, Any]): - remote_backend_cls = None - remote_backend_name_to_cls = { - 's3': S3ObjectStore, - 'oci': OCIObjectStore, - 'sftp': SFTPObjectStore, - 'libcloud': LibcloudObjectStore, - 'gs': GCSObjectStore, - } - - # Handle `dbfs` backend as a special case, since it can map to either :class:`.UCObjectStore` - # or :class:`.MLFlowObjectStore`. - if remote_backend_name == 'dbfs': - path = backend_kwargs['path'] - if path.startswith(MLFLOW_DBFS_PATH_PREFIX): - remote_backend_cls = MLFlowObjectStore - else: - # Validate if the path conforms to the requirements for UC volume paths - UCObjectStore.validate_path(path) - remote_backend_cls = UCObjectStore - else: - remote_backend_cls = remote_backend_name_to_cls.get(remote_backend_name, None) - if remote_backend_cls is None: - supported_remote_backends = list(remote_backend_name_to_cls.keys()) + ['dbfs'] - raise ValueError( - f'The remote backend {remote_backend_name} is not supported. Please use one of ({supported_remote_backends})', - ) - - return remote_backend_cls(**backend_kwargs) - - class RemoteUploaderDownloader(LoggerDestination): r"""Logger destination that uploads (downloads) files to (from) a remote backend. 
@@ -339,7 +304,7 @@ def __init__( def remote_backend(self) -> ObjectStore: """The :class:`.ObjectStore` instance for the main thread.""" if self._remote_backend is None: - self._remote_backend = _build_remote_backend(self.remote_backend_name, self.backend_kwargs) + self._remote_backend = build_remote_backend(self.remote_backend_name, self.backend_kwargs) return self._remote_backend def init(self, state: State, logger: Logger) -> None: @@ -359,7 +324,7 @@ def init(self, state: State, logger: Logger) -> None: retry( ObjectStoreTransientError, self.num_attempts, - )(lambda: _validate_credentials(self.remote_backend, file_name_to_test))() + )(lambda: validate_credentials(self.remote_backend, file_name_to_test))() # If the remote backend is an `MLFlowObjectStore`, the original path kwarg may have placeholders that can be # updated with information generated at runtime, i.e., the MLFlow experiment and run IDs. This information @@ -635,20 +600,6 @@ def _remote_file_name(self, remote_file_name: str): return key_name -def _validate_credentials( - remote_backend: ObjectStore, - remote_file_name_to_test: str, -) -> None: - # Validates the credentials by attempting to touch a file in the bucket - # raises an error if there was a credentials failure. - with tempfile.NamedTemporaryFile('wb') as f: - f.write(b'credentials_validated_successfully') - remote_backend.upload_object( - object_name=remote_file_name_to_test, - filename=f.name, - ) - - def _upload_worker( file_queue: Union[queue.Queue[tuple[str, str, bool]], multiprocessing.JoinableQueue[tuple[str, str, bool]]], completed_queue: Union[queue.Queue[str], multiprocessing.JoinableQueue[str]], @@ -663,7 +614,7 @@ def _upload_worker( The worker will continuously poll ``file_queue`` for files to upload. Once ``is_finished`` is set, the worker will exit once ``file_queue`` is empty. """ - remote_backend = _build_remote_backend(remote_backend_name, backend_kwargs) + remote_backend = build_remote_backend(remote_backend_name, backend_kwargs) while True: try: file_path_to_upload, remote_file_name, overwrite = file_queue.get(block=True, timeout=0.5) diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index f5a6b57d77..c752187ba6 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -1387,16 +1387,6 @@ def __init__( mosaicml_logger = MosaicMLLogger() loggers.append(mosaicml_logger) - # Remote Uploader Downloader - # Keep the ``RemoteUploaderDownloader`` below client-provided loggers so the loggers init callbacks run before - # the ``RemoteUploaderDownloader`` init. This is necessary to use an ``MLFlowObjectStore`` to log objects to a - # run managed by an ``MLFlowLogger``, as the ``MLFlowObjectStore`` relies on the ``MLFlowLogger`` to initialize - # the active MLFlow run. - if save_folder is not None: - remote_ud = maybe_create_remote_uploader_downloader_from_uri(save_folder, loggers) - if remote_ud is not None: - loggers.append(remote_ud) - # Logger self.logger = Logger(state=self.state, destinations=loggers) @@ -1451,14 +1441,12 @@ def __init__( # path then we assume they just want their checkpoints saved directly in their # bucket. if parsed_save_folder == '': - folder = '.' remote_file_name = save_filename latest_remote_file_name = save_latest_filename # If they actually specify a path, then we use that for their local save path # and we prefix save_filename with that path for remote_file_name. 
else: - folder = parsed_save_folder remote_file_name = str(Path(parsed_save_folder) / Path(save_filename)) if save_latest_filename is not None: latest_remote_file_name = str(Path(parsed_save_folder) / Path(save_latest_filename)) @@ -1466,7 +1454,7 @@ def __init__( latest_remote_file_name = None self._checkpoint_saver = CheckpointSaver( - folder=folder, + folder=save_folder, filename=save_filename, remote_file_name=remote_file_name, latest_filename=save_latest_filename, @@ -1889,14 +1877,17 @@ def _try_checkpoint_download( self, latest_checkpoint_path: str, save_latest_remote_file_name: str, - loggers: Sequence[LoggerDestination], + loggers: Sequence[Union[LoggerDestination, ObjectStore]], load_progress_bar: bool, ) -> None: """Attempts to download the checkpoint from the logger destinations.""" log.debug( f'Trying to download {save_latest_remote_file_name} to {latest_checkpoint_path} on rank {dist.get_global_rank()}', ) - for logger in loggers: + remote_destination = list(loggers) + if self._checkpoint_saver is not None and self._checkpoint_saver.remote_uploader is not None: + remote_destination.append(self._checkpoint_saver.remote_uploader.remote_backend) + for logger in remote_destination: try: # Fetch from logger. If it succeeds, stop trying the rest of the loggers get_file( @@ -1938,7 +1929,7 @@ def _get_autoresume_checkpoint( f'Looking for autoresume checkpoint: {save_latest_remote_file_name} (remote), {latest_checkpoint_path} (local)', ) - if self.state.deepspeed_enabled or self.state.fsdp_sharded_state_dict_enabled: + if self.state.deepspeed_enabled: # If latest checkpoint is not saved locally, try to fetch from loggers if not os.path.exists(latest_checkpoint_path): log.debug(f'Attempting to download the checkpoint on to rank {dist.get_global_rank()}') diff --git a/composer/utils/__init__.py b/composer/utils/__init__.py index f04da5c0e8..0850fd2bdd 100644 --- a/composer/utils/__init__.py +++ b/composer/utils/__init__.py @@ -44,6 +44,7 @@ maybe_create_object_store_from_uri, maybe_create_remote_uploader_downloader_from_uri, parse_uri, + validate_credentials, ) from composer.utils.import_helpers import MissingConditionalImportError, import_object from composer.utils.inference import ExportFormat, Transform, export_for_inference, export_with_logger, quantize_dynamic @@ -72,8 +73,10 @@ S3ObjectStore, SFTPObjectStore, UCObjectStore, + build_remote_backend, ) from composer.utils.parallelism import FSDPConfig, ParallelismConfig, TPConfig, create_fsdp_config +from composer.utils.remote_uploader import RemoteFilesExistingCheckStatus, RemoteUploader from composer.utils.retrying import retry from composer.utils.string_enum import StringEnum from composer.utils.warnings import VersionedDeprecationWarning @@ -155,4 +158,8 @@ 'ParallelismConfig', 'MLFLOW_EXPERIMENT_ID_FORMAT_KEY', 'MLFLOW_RUN_ID_FORMAT_KEY', + 'RemoteUploader', + 'validate_credentials', + 'build_remote_backend', + 'RemoteFilesExistingCheckStatus', ] diff --git a/composer/utils/file_helpers.py b/composer/utils/file_helpers.py index 2d14cc27ea..11d10328ea 100644 --- a/composer/utils/file_helpers.py +++ b/composer/utils/file_helpers.py @@ -49,6 +49,7 @@ 'maybe_create_object_store_from_uri', 'maybe_create_remote_uploader_downloader_from_uri', 'parse_uri', + 'validate_credentials', ] @@ -737,3 +738,18 @@ def create_symlink_file( raise ValueError('The symlink filename must end with .symlink.') with open(destination_filename, 'x') as f: f.write(existing_path) + + +def validate_credentials( + remote_backend: ObjectStore, + 
remote_file_name_to_test: str, +): + """Upload a tiny text file to test if the credentials are setup correctly.""" + # Validates the credentials by attempting to touch a file in the bucket + # raises an error if there was a credentials failure. + with tempfile.NamedTemporaryFile('wb') as f: + f.write(b'credentials_validated_successfully') + remote_backend.upload_object( + object_name=remote_file_name_to_test, + filename=f.name, + ) diff --git a/composer/utils/object_store/__init__.py b/composer/utils/object_store/__init__.py index 3c70257e08..6171013c2c 100644 --- a/composer/utils/object_store/__init__.py +++ b/composer/utils/object_store/__init__.py @@ -15,6 +15,7 @@ from composer.utils.object_store.s3_object_store import S3ObjectStore from composer.utils.object_store.sftp_object_store import SFTPObjectStore from composer.utils.object_store.uc_object_store import UCObjectStore +from composer.utils.object_store.utils import build_remote_backend __all__ = [ 'ObjectStore', @@ -28,4 +29,5 @@ 'UCObjectStore', 'MLFLOW_EXPERIMENT_ID_FORMAT_KEY', 'MLFLOW_RUN_ID_FORMAT_KEY', + 'build_remote_backend', ] diff --git a/composer/utils/object_store/utils.py b/composer/utils/object_store/utils.py new file mode 100644 index 0000000000..0d33774bc7 --- /dev/null +++ b/composer/utils/object_store/utils.py @@ -0,0 +1,48 @@ +# Copyright 2024 MosaicML Composer authors +# SPDX-License-Identifier: Apache-2.0 + +"""Helpers for working with object stores.""" + +from typing import Any + +from composer.utils.object_store.gcs_object_store import GCSObjectStore +from composer.utils.object_store.libcloud_object_store import LibcloudObjectStore +from composer.utils.object_store.mlflow_object_store import MLFLOW_DBFS_PATH_PREFIX, MLFlowObjectStore +from composer.utils.object_store.oci_object_store import OCIObjectStore +from composer.utils.object_store.s3_object_store import S3ObjectStore +from composer.utils.object_store.sftp_object_store import SFTPObjectStore +from composer.utils.object_store.uc_object_store import UCObjectStore + +__all__ = ['build_remote_backend'] + + +def build_remote_backend(remote_backend_name: str, backend_kwargs: dict[str, Any]): + """Build object store given the backend name and kwargs.""" + remote_backend_cls = None + remote_backend_name_to_cls = { + 's3': S3ObjectStore, + 'oci': OCIObjectStore, + 'sftp': SFTPObjectStore, + 'libcloud': LibcloudObjectStore, + 'gs': GCSObjectStore, + } + + # Handle `dbfs` backend as a special case, since it can map to either :class:`.UCObjectStore` + # or :class:`.MLFlowObjectStore`. + if remote_backend_name == 'dbfs': + path = backend_kwargs['path'] + if path.startswith(MLFLOW_DBFS_PATH_PREFIX): + remote_backend_cls = MLFlowObjectStore + else: + # Validate if the path conforms to the requirements for UC volume paths + UCObjectStore.validate_path(path) + remote_backend_cls = UCObjectStore + else: + remote_backend_cls = remote_backend_name_to_cls.get(remote_backend_name, None) + if remote_backend_cls is None: + supported_remote_backends = list(remote_backend_name_to_cls.keys()) + ['dbfs'] + raise ValueError( + f'The remote backend {remote_backend_name} is not supported. 
Please use one of ({supported_remote_backends})', + ) + + return remote_backend_cls(**backend_kwargs) diff --git a/composer/utils/remote_uploader.py b/composer/utils/remote_uploader.py index c26c73a319..33793e7c91 100644 --- a/composer/utils/remote_uploader.py +++ b/composer/utils/remote_uploader.py @@ -12,13 +12,20 @@ import time import uuid from concurrent.futures import Future, ProcessPoolExecutor -from typing import List +from enum import Enum +from typing import Any, Optional -from composer.utils.dist import get_local_rank +from composer.utils.dist import broadcast_object_list, get_global_rank, get_local_rank from composer.utils.file_helpers import ( - maybe_create_object_store_from_uri, + parse_uri, + validate_credentials, ) -from composer.utils.object_store.object_store import ObjectStore, ObjectStoreTransientError +from composer.utils.object_store.mlflow_object_store import MLFLOW_DBFS_PATH_PREFIX, MLFlowObjectStore +from composer.utils.object_store.object_store import ( + ObjectStore, + ObjectStoreTransientError, +) +from composer.utils.object_store.utils import build_remote_backend from composer.utils.retrying import retry log = logging.getLogger(__name__) @@ -26,16 +33,55 @@ __all__ = ['RemoteUploader'] +class RemoteFilesExistingCheckStatus(Enum): + EXIST = 1 + TIMEOUT = 2 + ERROR = 3 + + +def _check_remote_files_exists( + remote_backend_name: str, + backend_kwargs: dict[str, Any], + remote_checkpoint_file_names: list[str], + main_process_pid: int, + is_remote_upload_failed: multiprocessing.Event, # pyright: ignore[reportGeneralTypeIssues] + max_wait_time_in_seconds: int = 3600, + wait_before_next_try_in_seconds: float = 30, +): + start_time = time.time() + object_store = build_remote_backend(remote_backend_name, backend_kwargs) + + for remote_file_name in remote_checkpoint_file_names: + while True: + if is_remote_upload_failed.is_set(): + log.debug(f'Stop symlink uploading since the checkpoint files uploading failed') + return RemoteFilesExistingCheckStatus.ERROR + # Return if parent process exits + try: + os.kill(main_process_pid, 0) + except OSError: + return RemoteFilesExistingCheckStatus.ERROR + try: + object_store.get_object_size(remote_file_name) + break + except Exception as e: + if not isinstance(e, FileNotFoundError): + log.debug(f'Got exception {type(e)}: {str(e)} when accessing remote file {remote_file_name}') + time.sleep(wait_before_next_try_in_seconds) + if time.time() - start_time > max_wait_time_in_seconds: + return RemoteFilesExistingCheckStatus.TIMEOUT + return RemoteFilesExistingCheckStatus.EXIST + + def _upload_file_to_object_store( - remote_folder: str, + remote_backend_name: str, + backend_kwargs: dict[str, Any], remote_file_name: str, local_file_path: str, overwrite: bool, num_attempts: int, ) -> int: - object_store: ObjectStore = maybe_create_object_store_from_uri( - remote_folder, - ) # pyright: ignore[reportGeneralTypeIssues] + object_store = build_remote_backend(remote_backend_name, backend_kwargs) @retry(ObjectStoreTransientError, num_attempts=num_attempts) def upload_file(retry_index: int = 0): @@ -72,6 +118,7 @@ class RemoteUploader: def __init__( self, remote_folder: str, + backend_kwargs: Optional[dict[str, Any]] = None, num_concurrent_uploads: int = 2, num_attempts: int = 3, ): @@ -84,18 +131,80 @@ def __init__( # A folder to use for staging uploads self._tempdir = tempfile.TemporaryDirectory() self._upload_staging_folder = self._tempdir.name + self.remote_backend_name, self.remote_bucket_name, self.path = parse_uri(remote_folder) - 
self.num_attempts = num_attempts + self.backend_kwargs: dict[str, Any] = backend_kwargs if backend_kwargs is not None else {} + if self.remote_backend_name in ['s3', 'oci', 'gs'] and 'bucket' not in self.backend_kwargs: + self.backend_kwargs['bucket'] = self.remote_bucket_name + elif self.remote_backend_name == 'libcloud': + if 'container' not in self.backend_kwargs: + self.backend_kwargs['container'] = self.remote_bucket_name + elif self.remote_backend_name == 'azure': + self.remote_backend_name = 'libcloud' + self.backend_kwargs = { + 'provider': 'AZURE_BLOBS', + 'container': self.remote_bucket_name, + 'key_environ': 'AZURE_ACCOUNT_NAME', + 'secret_environ': 'AZURE_ACCOUNT_ACCESS_KEY', + } + elif self.remote_backend_name == 'dbfs': + self.backend_kwargs['path'] = self.path + elif self.remote_backend_name == 'wandb': + raise NotImplementedError( + f'There is no implementation for WandB via URI. Please use ' + 'WandBLogger with log_artifacts set to True.', + ) + else: + raise NotImplementedError( + f'There is no implementation for the cloud backend {self.remote_backend_name} via URI. Please use ' + 'one of the supported object stores (s3, oci, gs, azure, dbfs).', + ) - self.executor = ProcessPoolExecutor( + self.num_attempts = num_attempts + self._remote_backend: Optional[ObjectStore] = None + mp_context = multiprocessing.get_context('spawn') + self.upload_executor = ProcessPoolExecutor( max_workers=num_concurrent_uploads, - mp_context=multiprocessing.get_context('spawn'), + mp_context=mp_context, ) + self.check_remote_files_exist_executor = ProcessPoolExecutor( + max_workers=2, + mp_context=mp_context, + ) + self.is_remote_upload_failed = mp_context.Manager().Event() # Used internally to track the future status. # If a future completed successfully, we'll remove it from this list # when check_workers() or wait() is called - self.futures: List[Future] = [] + self.futures: list[Future] = [] + + self.pid = os.getpid() + + @property + def remote_backend(self) -> ObjectStore: + if self._remote_backend is None: + self._remote_backend = build_remote_backend(self.remote_backend_name, self.backend_kwargs) + return self._remote_backend + + def init(self): + # If it's dbfs path like: dbfs:/databricks/mlflow-tracking/{mlflow_experiment_id}/{mlflow_run_id}/ + # We need to fill out the experiment_id and run_id + + if get_global_rank() == 0: + + @retry(ObjectStoreTransientError, num_attempts=self.num_attempts) + def _validate_credential_with_retry(): + validate_credentials(self.remote_backend, '.credentials_validated_successfully') + + _validate_credential_with_retry() + if self.path.startswith(MLFLOW_DBFS_PATH_PREFIX): + if get_global_rank() == 0: + assert isinstance(self.remote_backend, MLFlowObjectStore) + self.path = self.remote_backend.get_dbfs_path(self.path) + path_list = [self.path] + broadcast_object_list(path_list, src=0) + self.path = path_list[0] + self.backend_kwargs['path'] = self.path def upload_file_async( self, @@ -114,9 +223,10 @@ def upload_file_async( shutil.copy2(file_path, copied_path) # Async upload file - future = self.executor.submit( + future = self.upload_executor.submit( _upload_file_to_object_store, - remote_folder=self.remote_folder, + remote_backend_name=self.remote_backend_name, + backend_kwargs=self.backend_kwargs, remote_file_name=remote_file_name, local_file_path=copied_path, overwrite=overwrite, @@ -132,12 +242,13 @@ def check_workers(self): 1. if it completed with exception, raise that exception 2. 
if it completed without exception, remove it from self.futures """ - done_futures: List[Future] = [] + done_futures: list[Future] = [] for future in self.futures: if future.done(): # future.exception is a blocking call exception_or_none = future.exception() if exception_or_none is not None: + self.is_remote_upload_failed.set() raise exception_or_none else: done_futures.append(future) @@ -153,6 +264,7 @@ def wait(self): for future in self.futures: exception_or_none = future.exception() if exception_or_none is not None: + self.is_remote_upload_failed.set() raise exception_or_none self.futures = [] @@ -165,4 +277,25 @@ def wait_and_close(self): """ # make sure all workers are either running, or completed successfully self.wait() - self.executor.shutdown(wait=True) + self.upload_executor.shutdown(wait=True) + self.check_remote_files_exist_executor.shutdown(wait=True) + log.debug('Finished all uploading tasks, closing RemoteUploader') + + def check_remote_files_exist_async( + self, + remote_checkpoint_file_names: list[str], + max_wait_time_in_seconds: int = 3600, + wait_before_next_try_in_seconds: float = 30, + ): + future = self.check_remote_files_exist_executor.submit( + _check_remote_files_exists, + remote_backend_name=self.remote_backend_name, + backend_kwargs=self.backend_kwargs, + remote_checkpoint_file_names=remote_checkpoint_file_names, + main_process_pid=self.pid, + is_remote_upload_failed=self.is_remote_upload_failed, + max_wait_time_in_seconds=max_wait_time_in_seconds, + wait_before_next_try_in_seconds=wait_before_next_try_in_seconds, + ) + self.futures.append(future) + return future diff --git a/docs/source/doctest_fixtures.py b/docs/source/doctest_fixtures.py index 553d8d9b60..f54d1f69e1 100644 --- a/docs/source/doctest_fixtures.py +++ b/docs/source/doctest_fixtures.py @@ -54,7 +54,7 @@ from composer.loggers import RemoteUploaderDownloader from composer.models import ComposerModel as ComposerModel from composer.optim.scheduler import ConstantScheduler -from composer.utils import LibcloudObjectStore +from composer.utils import LibcloudObjectStore, RemoteUploader from composer.utils import ensure_tuple as ensure_tuple try: @@ -246,6 +246,29 @@ def _new_RemoteUploaderDownloader_init(self, fake_ellipses: None = None, **kwarg RemoteUploaderDownloader.__init__ = _new_RemoteUploaderDownloader_init # type: ignore +# Patch RemoteUploader __init__ function to replace arguments while preserving type +_original_RemoteUploader_init = RemoteUploader.__init__ + + +def _new_RemoteUploader_init(self, fake_ellipses: None = None, **kwargs: Any): + os.makedirs('./object_store', exist_ok=True) + kwargs.update( + num_concurrent_uploads=1, + remote_folder='libcloud://.', + backend_kwargs={ + 'provider': 'local', + 'container': '.', + 'provider_kwargs': { + 'key': os.path.abspath('./object_store'), + }, + }, + num_attempts=1, + ) + _original_RemoteUploader_init(self, **kwargs) + + +RemoteUploader.__init__ = _new_RemoteUploader_init + # Patch ObjectStore __init__ function to replace arguments while preserving type _original_libcloudObjectStore_init = LibcloudObjectStore.__init__ diff --git a/tests/loggers/test_remote_uploader_downloader.py b/tests/loggers/test_remote_uploader_downloader.py index 1f877d2dd9..b25e23a717 100644 --- a/tests/loggers/test_remote_uploader_downloader.py +++ b/tests/loggers/test_remote_uploader_downloader.py @@ -77,7 +77,7 @@ def object_store_test_helper( # Patching does not work when using multiprocessing with spawn, so we also # patch to use fork fork_context = 
multiprocessing.get_context('fork') - with patch('composer.loggers.remote_uploader_downloader.S3ObjectStore', DummyObjectStore): + with patch('composer.utils.object_store.utils.S3ObjectStore', DummyObjectStore): with patch('composer.loggers.remote_uploader_downloader.multiprocessing.get_context', lambda _: fork_context): remote_uploader_downloader = RemoteUploaderDownloader( bucket_uri='s3://{remote_dir}', @@ -227,7 +227,7 @@ def get_object_size(self, object_name: str) -> int: return super().get_object_size(object_name) fork_context = multiprocessing.get_context('fork') - with patch('composer.loggers.remote_uploader_downloader.S3ObjectStore', RetryDummyObjectStore): + with patch('composer.utils.object_store.utils.S3ObjectStore', RetryDummyObjectStore): with patch('composer.loggers.remote_uploader_downloader.multiprocessing.get_context', lambda _: fork_context): remote_uploader_downloader = RemoteUploaderDownloader( bucket_uri=f"s3://{tmp_path}/'object_store_backend", @@ -263,7 +263,7 @@ def test_race_with_overwrite(tmp_path: pathlib.Path, use_procs: bool, dummy_stat # Patching does not work when using multiprocessing with spawn, so we also # patch to use fork fork_context = multiprocessing.get_context('fork') - with patch('composer.loggers.remote_uploader_downloader.S3ObjectStore', DummyObjectStore): + with patch('composer.utils.object_store.utils.S3ObjectStore', DummyObjectStore): with patch('composer.loggers.remote_uploader_downloader.multiprocessing.get_context', lambda _: fork_context): # Create the object store logger remote_uploader_downloader = RemoteUploaderDownloader( @@ -307,7 +307,7 @@ def test_race_with_overwrite(tmp_path: pathlib.Path, use_procs: bool, dummy_stat def test_close_on_failure(tmp_path: pathlib.Path, dummy_state: State): """Test that .close() and .post_close() does not hang even when a worker crashes.""" - with patch('composer.loggers.remote_uploader_downloader.S3ObjectStore', DummyObjectStore): + with patch('composer.utils.object_store.utils.S3ObjectStore', DummyObjectStore): # Create the object store logger remote_uploader_downloader = RemoteUploaderDownloader( bucket_uri=f"s3://{tmp_path}/'object_store_backend", @@ -355,9 +355,9 @@ def test_close_on_failure(tmp_path: pathlib.Path, dummy_state: State): def test_valid_backend_names(): valid_backend_names = ['s3', 'libcloud', 'sftp'] - with patch('composer.loggers.remote_uploader_downloader.S3ObjectStore') as _, \ - patch('composer.loggers.remote_uploader_downloader.SFTPObjectStore') as _, \ - patch('composer.loggers.remote_uploader_downloader.LibcloudObjectStore') as _: + with patch('composer.utils.object_store.utils.S3ObjectStore') as _, \ + patch('composer.utils.object_store.utils.SFTPObjectStore') as _, \ + patch('composer.utils.object_store.utils.LibcloudObjectStore') as _: for name in valid_backend_names: remote_uploader_downloader = RemoteUploaderDownloader(bucket_uri=f'{name}://not-a-real-bucket') # Access the remote_backend property so that it is built @@ -374,7 +374,7 @@ def test_valid_backend_names(): def test_exception_queue_works(tmp_path: pathlib.Path, dummy_state: State): """Test that exceptions get put on the exception queue and get thrown""" - with patch('composer.loggers.remote_uploader_downloader.S3ObjectStore', DummyObjectStore): + with patch('composer.utils.object_store.utils.S3ObjectStore', DummyObjectStore): # Create the object store logger remote_uploader_downloader = RemoteUploaderDownloader( bucket_uri=f"s3://{tmp_path}/'object_store_backend", diff --git a/tests/trainer/test_checkpoint.py 
b/tests/trainer/test_checkpoint.py index 9912563eb8..ede864d13b 100644 --- a/tests/trainer/test_checkpoint.py +++ b/tests/trainer/test_checkpoint.py @@ -4,6 +4,7 @@ import contextlib import copy import io +import multiprocessing import os import pathlib import re @@ -25,12 +26,11 @@ from composer.algorithms import NoOpModel from composer.callbacks import CheckpointSaver from composer.core import Callback, Time, TimeUnit -from composer.loggers import RemoteUploaderDownloader, remote_uploader_downloader from composer.metrics import MAP from composer.optim import ExponentialScheduler from composer.trainer import trainer from composer.trainer.trainer import Trainer -from composer.utils import dist, is_tar, reproducibility +from composer.utils import dist, is_tar, remote_uploader, reproducibility from composer.utils.checkpoint import ( _COMPOSER_STATES_FILENAME, PartialFilePath, @@ -52,6 +52,7 @@ device, ) from tests.common.markers import world_size +from tests.utils.test_remote_uploader import DummyObjectStore class DummyStatefulCallback(Callback): @@ -309,30 +310,6 @@ def get_trainer(self, **kwargs): model = SimpleConvModel() return Trainer(model=model, **kwargs) - @pytest.mark.parametrize('add_remote_ud', [True, False]) - def test_s3_uri_creates_remote_ud(self, add_remote_ud: bool, monkeypatch: MonkeyPatch): - mock_validate_credentials = MagicMock() - monkeypatch.setattr(remote_uploader_downloader, '_validate_credentials', mock_validate_credentials) - if add_remote_ud: - with pytest.warns(UserWarning): - trainer = self.get_trainer( - save_folder='s3://bucket_name/{run_name}/checkpoints', - loggers=[ - RemoteUploaderDownloader('s3://bucket_name', file_path_format_string='{remote_file_name}'), - ], - ) - else: - trainer = self.get_trainer(save_folder='s3://bucket_name/{run_name}/checkpoints') - - remote_uds = [ - logger_dest for logger_dest in trainer.logger.destinations - if isinstance(logger_dest, RemoteUploaderDownloader) - ] - assert len(remote_uds) == 1 - remote_ud = remote_uds[0] - assert remote_ud.remote_backend_name == 's3' - assert remote_ud.remote_bucket_name == 'bucket_name' - @pytest.mark.parametrize('uri', ['wandb://foo/bar', 'gcs://foo/bar', 'sftp://foo/bar"']) def test_other_uris_error_out(self, uri: str): with pytest.raises(NotImplementedError): @@ -394,7 +371,7 @@ def test_checkpoint_saver_properly_constructed( monkeypatch: MonkeyPatch, ): mock_validate_credentials = MagicMock() - monkeypatch.setattr(remote_uploader_downloader, '_validate_credentials', mock_validate_credentials) + monkeypatch.setattr(remote_uploader, 'validate_credentials', mock_validate_credentials) trainer = self.get_trainer(save_folder=save_folder) @@ -646,6 +623,71 @@ def test_checkpoint_multiple_callbacks( assert id(trainer._checkpoint_saver) == id(checkpoint_savers[0]) assert len([cb for cb in trainer.state.callbacks if isinstance(cb, CheckpointSaver)]) == len(checkpoint_savers) + @pytest.mark.parametrize(('upload_success'), [True, False]) + def test_checkpoint_remote_symlink( + self, + upload_success: bool, + ): + import multiprocessing + fork_context = multiprocessing.get_context('fork') + tmp_dir = tempfile.TemporaryDirectory() + + def _get_tmp_dir(self): + return tmp_dir + + class _AlwaysFailDummyObjectStore(DummyObjectStore): + + def upload_object(self, object_name, filename, callback=None): + # Only allows to upload symlink to simulate + # the situation that checkpoint file uploading fails + if 'symlink' in object_name or 'credentials_validated_successfully' in object_name: + return 
super().upload_object(object_name, filename, callback) + raise RuntimeError('Raise Error intentionally') + + if upload_success: + MockObjectStore = DummyObjectStore + else: + MockObjectStore = _AlwaysFailDummyObjectStore + + with patch('composer.utils.object_store.utils.S3ObjectStore', MockObjectStore): + with patch('tests.utils.test_remote_uploader.DummyObjectStore.get_tmp_dir', _get_tmp_dir): + with patch('composer.utils.remote_uploader.multiprocessing.get_context', lambda _: fork_context): + train_dataset = RandomClassificationDataset(size=10) + train_dataloader = DataLoader( + dataset=train_dataset, + batch_size=2, + sampler=dist.get_sampler(train_dataset), + ) + + trainer = Trainer( + model=SimpleModel(), + train_dataloader=train_dataloader, + save_interval='1ba', + max_duration='1ba', + save_folder='S3://whatever/', + ) + symlink_filepath = os.path.join(tmp_dir.name, 'latest-rank0.pt.symlink') + if upload_success: + trainer.fit() + with open(symlink_filepath, 'r') as f: + assert f.read() == 'ep0-ba1-rank0.pt' + else: + assert trainer._checkpoint_saver is not None + trainer._checkpoint_saver._symlink_upload_wait_before_next_try_in_seconds = 0.01 + trainer._checkpoint_saver.upload_timeout_in_seconds = 1 + with pytest.raises(RuntimeError, match='Raise Error intentionally'): + trainer.fit() + assert os.path.exists(symlink_filepath) == False + + def post_close(self): + return + + assert trainer._checkpoint_saver is not None + trainer._checkpoint_saver.post_close = post_close.__get__( + trainer._checkpoint_saver, + CheckpointSaver, + ) + class TestCheckpointLoading: @@ -709,25 +751,6 @@ def get_trainer( **kwargs, ) - def get_logger(self, tmp_path: pathlib.Path): - """Returns an object store logger that saves locally.""" - remote_dir = str(tmp_path / 'object_store') - os.makedirs(remote_dir, exist_ok=True) - - return RemoteUploaderDownloader( - bucket_uri='libcloud://.', - backend_kwargs={ - 'provider': 'local', - 'container': '.', - 'provider_kwargs': { - 'key': remote_dir, - }, - }, - num_concurrent_uploads=1, - use_procs=False, - upload_staging_folder=str(tmp_path / 'staging_folder'), - ) - @world_size(1, 2) @device('cpu', 'gpu') @pytest.mark.parametrize('use_object_store', [True, False]) @@ -758,9 +781,6 @@ def test_autoresume( if delete_local and not use_object_store: pytest.skip('Invalid test setting.') - if use_object_store: - pytest.importorskip('libcloud') - latest_filename = 'latest-rank{rank}' + file_extension if test_slashed: latest_filename = 'testdir/' + latest_filename @@ -768,51 +788,68 @@ def test_autoresume( if is_compressed_pt(latest_filename) and not get_compressor(latest_filename).exists: pytest.skip(reason=f'compressor not found for {latest_filename}') - trainer_1 = self.get_trainer( - latest_filename=latest_filename, - file_extension=file_extension, - save_folder='first', - device=device, - run_name='big-chungus', - autoresume=True, - loggers=[self.get_logger(tmp_path)] if use_object_store else [], - save_metrics=save_metrics, - ) - - # trains the model, saving the checkpoint files - trainer_1.fit() - trainer_1.close() - - if delete_local: - # delete files locally, forcing trainer to look in object store - shutil.rmtree('first') - - trainer_2 = self.get_trainer( - latest_filename=latest_filename, - save_folder='first', - device=device, - run_name='big-chungus', - autoresume=True, - load_path='ignore_me.pt', # this should be ignored - load_ignore_keys=['*'], # this should be ignored - save_overwrite=save_overwrite, - loggers=[self.get_logger(tmp_path)] if 
use_object_store else [], - ) - - self._assert_weights_equivalent( - trainer_1.state.model, - trainer_2.state.model, - ) - - if save_metrics: - assert self._metrics_equal( - trainer_1.state.train_metrics, - trainer_2.state.train_metrics, - trainer_1.state.eval_metrics, - trainer_2.state.eval_metrics, - ), 'Original metrics do not equal metrics from loaded checkpoint.' - - assert trainer_1.state.run_name == trainer_2.state.run_name + if use_object_store: + save_folder = 's3://bucket_name/first' + else: + save_folder = 'first' + + # Mock S3 object store + fork_context = multiprocessing.get_context('fork') + tmp_dir = tempfile.TemporaryDirectory() + + def _get_tmp_dir(self): + return tmp_dir + + with patch('composer.utils.object_store.utils.S3ObjectStore', DummyObjectStore): + with patch('tests.utils.test_remote_uploader.DummyObjectStore.get_tmp_dir', _get_tmp_dir): + with patch('composer.utils.remote_uploader.multiprocessing.get_context', lambda _: fork_context): + + trainer_1 = self.get_trainer( + latest_filename=latest_filename, + file_extension=file_extension, + save_folder=save_folder, + device=device, + run_name='big-chungus', + autoresume=True, + save_metrics=save_metrics, + ) + if use_object_store: + assert trainer_1._checkpoint_saver is not None + trainer_1._checkpoint_saver._symlink_upload_wait_before_next_try_in_seconds = 0.01 + + # trains the model, saving the checkpoint files + trainer_1.fit() + trainer_1.close() + + if delete_local: + # delete files locally, forcing trainer to look in object store + shutil.rmtree('first') + + trainer_2 = self.get_trainer( + latest_filename=latest_filename, + save_folder=save_folder, + device=device, + run_name='big-chungus', + autoresume=True, + load_path='ignore_me.pt', # this should be ignored + load_ignore_keys=['*'], # this should be ignored + save_overwrite=save_overwrite, + ) + + self._assert_weights_equivalent( + trainer_1.state.model, + trainer_2.state.model, + ) + + if save_metrics: + assert self._metrics_equal( + trainer_1.state.train_metrics, + trainer_2.state.train_metrics, + trainer_1.state.eval_metrics, + trainer_2.state.eval_metrics, + ), 'Original metrics do not equal metrics from loaded checkpoint.' 
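The object-store mocking recipe above repeats throughout these tests: patch the `S3ObjectStore` symbol that `build_remote_backend` resolves with the local-directory `DummyObjectStore`, pin every store instance to one shared temporary directory, and force fork-based worker processes so the patches survive inside the uploader subprocess. A minimal sketch of that recipe, using only names defined in these tests (the trainer construction inside the `with` block is elided):

import multiprocessing
import tempfile
from unittest.mock import patch

from tests.utils.test_remote_uploader import DummyObjectStore

fork_context = multiprocessing.get_context('fork')
shared_tmp_dir = tempfile.TemporaryDirectory()

def _get_tmp_dir(self):
    # Every DummyObjectStore instance shares one backing directory, so files
    # "uploaded" by the worker process are visible to the test process.
    return shared_tmp_dir

with patch('composer.utils.object_store.utils.S3ObjectStore', DummyObjectStore), \
     patch('tests.utils.test_remote_uploader.DummyObjectStore.get_tmp_dir', _get_tmp_dir), \
     patch('composer.utils.remote_uploader.multiprocessing.get_context', lambda _: fork_context):
    ...  # build the Trainer with an s3:// save_folder and run fit()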
+ + assert trainer_1.state.run_name == trainer_2.state.run_name @pytest.mark.parametrize(('save_folder'), [None, 'first']) def test_autoresume_from_callback( @@ -862,7 +899,7 @@ def test_autoresume_from_callback( def test_load_from_uri(self, load_path: str, load_object_store: Optional[ObjectStore], monkeypatch: MonkeyPatch): mock_validate_credentials = MagicMock() - monkeypatch.setattr(remote_uploader_downloader, '_validate_credentials', mock_validate_credentials) + monkeypatch.setattr(remote_uploader, 'validate_credentials', mock_validate_credentials) mock_load_checkpoint = MagicMock() monkeypatch.setattr(trainer.checkpoint, 'load_checkpoint', mock_load_checkpoint) self.get_trainer(load_path=load_path, load_object_store=load_object_store) @@ -882,7 +919,7 @@ def test_load_from_uri(self, load_path: str, load_object_store: Optional[ObjectS ) def test_other_backends_error(self, load_path: str, monkeypatch: MonkeyPatch): mock_validate_credentials = MagicMock() - monkeypatch.setattr(remote_uploader_downloader, '_validate_credentials', mock_validate_credentials) + monkeypatch.setattr(remote_uploader, 'validate_credentials', mock_validate_credentials) with pytest.raises(NotImplementedError): self.get_trainer(load_path=load_path) @@ -1197,29 +1234,37 @@ def _stateful_callbacks_equal(self, callbacks1, callbacks2): return cb1.random_value == cb2.random_value def test_load_weights_object_store(self, tmp_path): - - pytest.importorskip('libcloud') - - trainer_1 = self.get_trainer( - save_folder='{run_name}/checkpoints', - loggers=[self.get_logger(tmp_path)], - run_name='electric-zebra', - ) - trainer_1.fit() - trainer_1.close() - - trainer_2 = self.get_trainer( - loggers=[self.get_logger(tmp_path)], - run_name='electric-zebra', - load_path='electric-zebra/checkpoints/latest-rank0.pt', - load_object_store=self.get_logger(tmp_path), - ) - - # check weights loaded properly - self._assert_weights_equivalent( - trainer_1.state.model, - trainer_2.state.model, - ) + # Mock S3 object store + fork_context = multiprocessing.get_context('fork') + tmp_dir = tempfile.TemporaryDirectory() + + def _get_tmp_dir(self): + return tmp_dir + + with patch('composer.utils.object_store.utils.S3ObjectStore', DummyObjectStore): + with patch('tests.utils.test_remote_uploader.DummyObjectStore.get_tmp_dir', _get_tmp_dir): + with patch('composer.utils.remote_uploader.multiprocessing.get_context', lambda _: fork_context): + save_folder = 's3://my_bucket/{run_name}/checkpoints' + trainer_1 = self.get_trainer( + save_folder=save_folder, + run_name='electric-zebra', + ) + assert trainer_1._checkpoint_saver is not None + trainer_1._checkpoint_saver._symlink_upload_wait_before_next_try_in_seconds = 0.01 + trainer_1.fit() + trainer_1.close() + + trainer_2 = self.get_trainer( + run_name='electric-zebra', + load_path='electric-zebra/checkpoints/latest-rank0.pt', + load_object_store=DummyObjectStore(), + ) + + # check weights loaded properly + self._assert_weights_equivalent( + trainer_1.state.model, + trainer_2.state.model, + ) @pytest.mark.parametrize( 'run_name,save_folder,latest_filename', diff --git a/tests/utils/test_remote_uploader.py b/tests/utils/test_remote_uploader.py index 847abb369d..2e41e91d18 100644 --- a/tests/utils/test_remote_uploader.py +++ b/tests/utils/test_remote_uploader.py @@ -20,7 +20,7 @@ class DummyObjectStore(ObjectStore): """Dummy ObjectStore implementation that is backed by a local directory.""" def __init__(self, **kwargs: Dict[str, Any]) -> None: - self.tmp_dir = tempfile.TemporaryDirectory() + self.tmp_dir = 
self.get_tmp_dir() self.root = self.tmp_dir.name self.sleep_sec = 0 self.dest_filename = '' @@ -28,6 +28,9 @@ def __init__(self, **kwargs: Dict[str, Any]) -> None: def raise_error(self): return False + def get_tmp_dir(self): + return tempfile.TemporaryDirectory() + def upload_object( self, object_name: str, @@ -38,6 +41,7 @@ def upload_object( raise RuntimeError('Raise Error intentionally') time.sleep(self.sleep_sec) dest_filename = pathlib.Path(self.root) / object_name + os.makedirs(os.path.dirname(dest_filename), exist_ok=True) shutil.copy2(filename, dest_filename) self.dest_filename = dest_filename @@ -46,6 +50,16 @@ def get_object_size(self, object_name: str) -> int: size = os.stat(object_path).st_size return size + def download_object( + self, + object_name: str, + filename: Union[str, pathlib.Path], + overwrite: bool = False, + callback: Optional[Callable[[int, int], None]] = None, + ): + object_path = pathlib.Path(self.root) / object_name + shutil.copy2(object_path, filename) + def test_upload_mutliple_files(): fork_context = multiprocessing.get_context('fork') @@ -54,7 +68,7 @@ def test_upload_mutliple_files(): def _get_tmp_dir(): return tmp_dir - with patch('composer.utils.file_helpers.S3ObjectStore', DummyObjectStore): + with patch('composer.utils.object_store.utils.S3ObjectStore', DummyObjectStore): with patch('tempfile.TemporaryDirectory', _get_tmp_dir): with patch('composer.utils.remote_uploader.multiprocessing.get_context', lambda _: fork_context): remote_uploader = RemoteUploader( @@ -99,7 +113,7 @@ def _get_tmp_dir(): return remote_tmp_dir fork_context = multiprocessing.get_context('fork') - with patch('composer.utils.file_helpers.S3ObjectStore', DummyObjectStore): + with patch('composer.utils.object_store.utils.S3ObjectStore', DummyObjectStore): with patch('tempfile.TemporaryDirectory', _get_tmp_dir): with patch('composer.utils.remote_uploader.multiprocessing.get_context', lambda _: fork_context): remote_uploader = RemoteUploader(remote_folder='S3://whatever/path',) @@ -145,7 +159,7 @@ def raise_error(self): return True fork_context = multiprocessing.get_context('fork') - with patch('composer.utils.file_helpers.S3ObjectStore', AlwaysFailDummyObjectStore): + with patch('composer.utils.object_store.utils.S3ObjectStore', AlwaysFailDummyObjectStore): with patch('composer.utils.remote_uploader.multiprocessing.get_context', lambda _: fork_context): remote_uploader = RemoteUploader(remote_folder='S3://whatever/path',) tmp_dir = tempfile.TemporaryDirectory() @@ -168,7 +182,7 @@ def raise_error(self): def test_wait(): fork_context = multiprocessing.get_context('fork') - with patch('composer.utils.file_helpers.S3ObjectStore', DummyObjectStore): + with patch('composer.utils.object_store.utils.S3ObjectStore', DummyObjectStore): with patch('composer.utils.remote_uploader.multiprocessing.get_context', lambda _: fork_context): remote_uploader = RemoteUploader( remote_folder='S3://whatever/path', @@ -197,7 +211,7 @@ def test_wait(): def test_wait_and_close(): fork_context = multiprocessing.get_context('fork') - with patch('composer.utils.file_helpers.S3ObjectStore', DummyObjectStore): + with patch('composer.utils.object_store.utils.S3ObjectStore', DummyObjectStore): with patch('composer.utils.remote_uploader.multiprocessing.get_context', lambda _: fork_context): remote_uploader = RemoteUploader( remote_folder='S3://whatever/path', From e951f0a81ed65ea3f607cf437b6387de4c7fc632 Mon Sep 17 00:00:00 2001 From: Saaketh Narayan Date: Mon, 8 Jul 2024 16:19:32 -0700 Subject: [PATCH 55/69] 
Correctly process `parallelism_config['tp']` when it's a dict (#3434) * big fix * testing * ignore * ignore * ignore * Update test_fsdp_checkpoint.py * Update test_fsdp_checkpoint.py --------- Co-authored-by: Mihir Patel --- composer/trainer/trainer.py | 2 +- tests/trainer/test_fsdp_checkpoint.py | 17 +++++++++++++++-- tests/trainer/test_tp.py | 1 + 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index c752187ba6..b410e8aa96 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -1231,7 +1231,7 @@ def __init__( if isinstance(parallelism_config['tp'], TPConfig): parallelism_config_args['tp'] = parallelism_config['tp'] else: - parallelism_config['tp'] = TPConfig(**parallelism_config['tp']) + parallelism_config_args['tp'] = TPConfig(**parallelism_config['tp']) parallelism_config = ParallelismConfig( **parallelism_config_args, ) if len(parallelism_config_args) > 0 else None diff --git a/tests/trainer/test_fsdp_checkpoint.py b/tests/trainer/test_fsdp_checkpoint.py index 154ed6b282..a59e60172a 100644 --- a/tests/trainer/test_fsdp_checkpoint.py +++ b/tests/trainer/test_fsdp_checkpoint.py @@ -30,7 +30,7 @@ from composer.models import ComposerClassifier from composer.optim import DecoupledAdamW from composer.trainer import Trainer -from composer.utils import FSDPConfig, dist, parse_uri +from composer.utils import FSDPConfig, TPConfig, dist, parse_uri from composer.utils.checkpoint import is_checkpoint_legacy_sharded from composer.utils.file_helpers import get_file from composer.utils.object_store import S3ObjectStore @@ -288,6 +288,7 @@ def _compare_timestamps_between_state_dicts(state_dict1, state_dict2): @pytest.mark.gpu @pytest.mark.filterwarnings(r'ignore:.*scatter_full_optim_state_dict``is being deprecated.*:UserWarning') +@pytest.mark.filterwarnings(r'ignore:.*\(TP\) is experimental.*:FutureWarning') @pytest.mark.parametrize( 'optimizer,autoresume,precision,save_weights_only,load_weights_only,load_monolith_rank0_only,use_tp,use_hsdp', [ @@ -315,7 +316,9 @@ def test_fsdp_full_state_dict_load( use_hsdp: bool, ): if use_hsdp: - pytest.xfail('Known Pytorch issue with HSDP, waiting for pytorch patch') + pytest.xfail('Known PyTorch issue with HSDP, waiting for pytorch patch') + if use_tp: + pytest.skip('TP on PyTorch 2.3 has full state dict issues.') if (use_tp or use_hsdp) and version.parse(torch.__version__) < version.parse('2.3.0'): pytest.skip('HSDP and TP require torch 2.3.0 or later') if autoresume: @@ -360,6 +363,11 @@ def test_fsdp_full_state_dict_load( fsdp_config=fsdp_config, tp_config=tp_config, ) + + if use_tp: + assert trainer1.state.tp_config is not None + assert isinstance(trainer1.state.tp_config, TPConfig) + trainer1.fit() state_dict_from_trainer1 = trainer1.state.state_dict() trainer1.close() @@ -511,6 +519,7 @@ def test_fsdp_mixed_with_sync( @pytest.mark.filterwarnings(r'ignore:.*metrics are not saved with sharded state dict.*:UserWarning') @pytest.mark.filterwarnings(r'ignore:.*The CUDA RNG state could not be loaded.*:UserWarning') @pytest.mark.filterwarnings(r'ignore:.*ShardedTensor.to only move tensor to its current device.*:UserWarning') +@pytest.mark.filterwarnings(r'ignore:.*\(TP\) is experimental.*:FutureWarning') def test_fsdp_load_old_checkpoint( world_size, tmp_path: pathlib.Path, @@ -748,6 +757,7 @@ def test_fsdp_full_state_dict_load_with_ema( @pytest.mark.filterwarnings(r'ignore:TypedStorage is deprecated.:UserWarning') @pytest.mark.filterwarnings(r'ignore:.*metrics are not 
saved with sharded state dict.*:UserWarning') @pytest.mark.filterwarnings(r'ignore:Please use DTensor instead and we are deprecating ShardedTensor.:UserWarning') +@pytest.mark.filterwarnings(r'ignore:.*\(TP\) is experimental.*:FutureWarning') def test_checkpoint_loading_with_validation(world_size, tmp_path, is_valid_checkpoint: bool, state_dict_type: str): # Set the error expectations. expectation = does_not_raise() @@ -818,6 +828,7 @@ def mock_get_checkpoint_validation_function(): @pytest.mark.filterwarnings(r'ignore:TypedStorage is deprecated.:UserWarning') @pytest.mark.filterwarnings(r'ignore:.*metrics are not saved with sharded state dict.*:UserWarning') @pytest.mark.filterwarnings(r'ignore:Please use DTensor instead and we are deprecating ShardedTensor.:UserWarning') +@pytest.mark.filterwarnings(r'ignore:.*\(TP\) is experimental.*:FutureWarning') def test_fsdp_partitioned_state_dict_load( tmp_path: pathlib.Path, autoresume: bool, @@ -833,6 +844,8 @@ def test_fsdp_partitioned_state_dict_load( s3_ephemeral_prefix, request, ): + if use_tp: + pytest.skip('TP on PyTorch 2.3 has sharded state dict issues.') if weights_only and autoresume: pytest.skip('Weights only with autoresume is not supported') if (use_tp or use_hsdp) and version.parse(torch.__version__) < version.parse('2.3.0'): diff --git a/tests/trainer/test_tp.py b/tests/trainer/test_tp.py index 8146ebad40..bfee2e13c9 100644 --- a/tests/trainer/test_tp.py +++ b/tests/trainer/test_tp.py @@ -18,6 +18,7 @@ @pytest.mark.gpu @world_size(4) @pytest.mark.skipif(version.parse(torch.__version__) < version.parse('2.3'), reason='requires PyTorch 2.3+') +@pytest.mark.filterwarnings(r'ignore:.*\(TP\) is experimental.*:FutureWarning') def test_tp_train(world_size: int): from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel From 6dec8359374dc81b1569e21744c0942ff91f0e72 Mon Sep 17 00:00:00 2001 From: bigning Date: Mon, 8 Jul 2024 22:11:22 -0700 Subject: [PATCH 56/69] [checkpoint v2] Download api (#3447) * a * a * fix lint and test * lint * comments * comment --- composer/checkpoint/__init__.py | 2 + composer/checkpoint/download.py | 85 +++++++++++++++++++++++++++++ composer/utils/__init__.py | 2 + composer/utils/file_helpers.py | 11 ++++ tests/checkpoint/test_download.py | 56 +++++++++++++++++++ tests/utils/test_remote_uploader.py | 2 + 6 files changed, 158 insertions(+) create mode 100644 composer/checkpoint/download.py create mode 100644 tests/checkpoint/test_download.py diff --git a/composer/checkpoint/__init__.py b/composer/checkpoint/__init__.py index d4b21c790d..33162fc5e6 100644 --- a/composer/checkpoint/__init__.py +++ b/composer/checkpoint/__init__.py @@ -3,6 +3,7 @@ """Module for checkpointing API.""" +from composer.checkpoint.download import download_monolithic_checkpoint from composer.checkpoint.state_dict import ( get_metadata_state_dict, get_model_state_dict, @@ -15,4 +16,5 @@ 'get_optim_state_dict', 'get_metadata_state_dict', 'get_resumption_state_dict', + 'download_monolithic_checkpoint', ] diff --git a/composer/checkpoint/download.py b/composer/checkpoint/download.py new file mode 100644 index 0000000000..01a80beb5f --- /dev/null +++ b/composer/checkpoint/download.py @@ -0,0 +1,85 @@ +# Copyright 2024 MosaicML Composer authors +# SPDX-License-Identifier: Apache-2.0 + +"""Useful functions for load checkpoints from remote object store or local disk.""" + +import logging +from typing import Optional + +from composer.utils import ( + dist, + extract_path_from_symlink, + maybe_create_object_store_from_uri, + 
parse_uri, + retry, +) + +log = logging.getLogger(__name__) + + +def download_file( + source_uri: str, + destination_path: str, + node_ranks: Optional[list[int]] = None, + num_attempts: int = 5, +): + """Downloads a file (object) from the specified URI to the specified directory. + + Args: + source_uri (str): The URI to download the file from or a symlink to the URI. + destination_path (str): The directory to download the file to. + node_ranks (list[int]): The ranks of the nodes that will download the file. If None, all nodes will download the file. + num_attempts (int): Retry for object store downloads. Default to 5. + """ + # Only local rank 0 downloads + local_rank = dist.get_local_rank() + if local_rank != 0: + return + + node_rank = dist.get_node_rank() + if node_ranks is not None and node_rank not in node_ranks: + return + + object_store = maybe_create_object_store_from_uri(source_uri) + _, _, source_path = parse_uri(source_uri) + if source_uri.endswith('.symlink'): + source_path = extract_path_from_symlink(source_path, object_store) + assert object_store is not None + + @retry(num_attempts=num_attempts) + def _download(): + object_store.download_object( + object_name=source_path, + filename=destination_path, + ) + + log.debug(f'Downloading {source_path} to {destination_path}') + _download() + log.debug(f'Finished downloading {source_path} to {destination_path}') + + +def download_monolithic_checkpoint( + source_uri: str, + destination_path: str, + global_rank_zero_only: bool = True, +): + """Downloads a monolithic checkpoint from the specified URI to the specified directory. + + Args: + source_uri (str): The URI to download the checkpoint from or symlink that points to the URI. + destination_path (str): The directory to download the checkpoint to. + global_rank_zero_only (bool): If True, only rank 0 will download the checkpoint. + broadcast_files_to_other_nodes (bool): If True, the downloaded checkpoint will be broadcast to all other nodes. + If torch syncs modules states this is unnecessary. + """ + node_ranks = None + if global_rank_zero_only: + node_ranks = [0] + download_file( + source_uri=source_uri, + destination_path=destination_path, + node_ranks=node_ranks, + ) + if not global_rank_zero_only or (global_rank_zero_only and dist.get_global_rank() == 0): + return destination_path + return None diff --git a/composer/utils/__init__.py b/composer/utils/__init__.py index 0850fd2bdd..20fa44e092 100644 --- a/composer/utils/__init__.py +++ b/composer/utils/__init__.py @@ -37,6 +37,7 @@ create_symlink_file, ensure_folder_has_no_conflicting_files, ensure_folder_is_empty, + extract_path_from_symlink, format_name_with_dist, format_name_with_dist_and_time, get_file, @@ -158,6 +159,7 @@ 'ParallelismConfig', 'MLFLOW_EXPERIMENT_ID_FORMAT_KEY', 'MLFLOW_RUN_ID_FORMAT_KEY', + 'extract_path_from_symlink', 'RemoteUploader', 'validate_credentials', 'build_remote_backend', diff --git a/composer/utils/file_helpers.py b/composer/utils/file_helpers.py index 11d10328ea..4f458b0a8e 100644 --- a/composer/utils/file_helpers.py +++ b/composer/utils/file_helpers.py @@ -49,6 +49,7 @@ 'maybe_create_object_store_from_uri', 'maybe_create_remote_uploader_downloader_from_uri', 'parse_uri', + 'extract_path_from_symlink', 'validate_credentials', ] @@ -57,6 +58,16 @@ def extract_path_from_symlink( source_path: str, object_store: Optional[Union[LoggerDestination, ObjectStore]] = None, ) -> str: + """Returns the checkpont path from symlink file. + + Args: + source_path(str): The remote symlink path. 
+ object_store(LoggerDestination | ObjectStore, optional): The object store + used to download the remote symlink file + + Returns: + str: The content of the remote symlink file. + """ if object_store is not None: with tempfile.TemporaryDirectory() as tmpdir: _, _, source_path = parse_uri(source_path) diff --git a/tests/checkpoint/test_download.py b/tests/checkpoint/test_download.py new file mode 100644 index 0000000000..98c937bac4 --- /dev/null +++ b/tests/checkpoint/test_download.py @@ -0,0 +1,56 @@ +# Copyright 2024 MosaicML Composer authors +# SPDX-License-Identifier: Apache-2.0 + +import os +import tempfile +from unittest.mock import patch + +import pytest +import torch + +from composer.checkpoint import download_monolithic_checkpoint +from composer.utils import dist +from tests.checkpoint.helpers import init_model +from tests.common.markers import world_size +from tests.utils.test_remote_uploader import DummyObjectStore + + +@world_size(1, 2) +@pytest.mark.gpu +@pytest.mark.parametrize('rank_zero_only', [True, False]) +def test_download_monolithic_checkpoint(world_size: int, rank_zero_only: bool): + # Write a checkpoint + tmp_dir = tempfile.TemporaryDirectory() + use_fsdp = False + if world_size > 1: + use_fsdp = True + fsdp_model, _ = init_model(use_fsdp=use_fsdp) + + from torch.distributed.checkpoint.state_dict import StateDictOptions, get_model_state_dict + state = get_model_state_dict(fsdp_model, options=StateDictOptions(full_state_dict=True)) + + checkpoint_filename = 'state_dict' + save_filename = os.path.join(tmp_dir.name, checkpoint_filename) + if dist.get_global_rank() == 0: + torch.save(state, save_filename) + + class DummyS3ObjectStore(DummyObjectStore): + + def get_tmp_dir(self): + return tmp_dir + + # Download a monolithic checkpoint + local_file_name = 'state_dict.download' + with patch('composer.utils.file_helpers.S3ObjectStore', DummyS3ObjectStore): + ret = download_monolithic_checkpoint( + source_uri=f's3://bucket_name/{checkpoint_filename}', + destination_path=local_file_name, + global_rank_zero_only=rank_zero_only, + ) + dist.barrier() + + if rank_zero_only and dist.get_global_rank() != 0: + assert ret == None + if dist.get_global_rank() == 0: + assert ret == local_file_name + assert os.path.isfile(local_file_name) == True diff --git a/tests/utils/test_remote_uploader.py b/tests/utils/test_remote_uploader.py index 2e41e91d18..100e64ecf0 100644 --- a/tests/utils/test_remote_uploader.py +++ b/tests/utils/test_remote_uploader.py @@ -57,6 +57,8 @@ def download_object( overwrite: bool = False, callback: Optional[Callable[[int, int], None]] = None, ): + if overwrite is False and os.path.isfile(filename): + raise FileExistsError(f'The file at {filename} already exists and overwrite is set to False.') object_path = pathlib.Path(self.root) / object_name shutil.copy2(object_path, filename) From 18795f14ebc19f668ddabce2059382b6b516ce93 Mon Sep 17 00:00:00 2001 From: Jane Zhang Date: Tue, 9 Jul 2024 12:45:04 -0700 Subject: [PATCH 57/69] removed exception from logger (#3464) --- composer/loggers/mosaicml_logger.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/composer/loggers/mosaicml_logger.py b/composer/loggers/mosaicml_logger.py index 8b4ff5942a..d7c83b85fa 100644 --- a/composer/loggers/mosaicml_logger.py +++ b/composer/loggers/mosaicml_logger.py @@ -97,10 +97,6 @@ def log_hyperparameters(self, hyperparameters: dict[str, Any]): def log_metrics(self, metrics: dict[str, Any], step: Optional[int] = None) -> None: self.log_metadata(metrics) - def log_exception(self, 
exception: Exception): - self.log_metadata({'exception': exception_to_json_serializable_dict(exception)}) - self._flush_metadata(force_flush=True) - def after_load(self, state: State, logger: Logger) -> None: # Log model data downloaded and initialized for run events log.debug(f'Logging model initialized time to metadata') From 11bad573d28b4f8a362257fc81598577553bbed5 Mon Sep 17 00:00:00 2001 From: Jack Zhang <170473087+JackZ-db@users.noreply.github.com> Date: Thu, 11 Jul 2024 12:19:05 -0700 Subject: [PATCH 58/69] fixed docs for mfu (#3469) --- composer/callbacks/speed_monitor.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/composer/callbacks/speed_monitor.py b/composer/callbacks/speed_monitor.py index 2098036297..6320e06562 100644 --- a/composer/callbacks/speed_monitor.py +++ b/composer/callbacks/speed_monitor.py @@ -223,10 +223,11 @@ class SpeedMonitor(Callback): | `throughput/device/flops_per_sec` | logged when model has attribute `flops_per_batch` | | | | +-------------------------------------+-----------------------------------------------------------+ - | | `throughput/device/flops_per_sec` divided by world size. | - | `throughput/device/mfu` | Only logged when model has attribute `flops_per_batch` | - | | and `gpu_flops_available`, which can be passed as an | - | | argument if not automatically determined by SpeedMonitor | + | | `throughput/device/flops_per_sec` divided by flops | + | | available on the GPU device. Only logged when model has | + | `throughput/device/mfu` | attribute `flops_per_batch` and `gpu_flops_available`, | + | | which can be passed as an argument if not automatically | + | | determined by SpeedMonitor | +-------------------------------------+-----------------------------------------------------------+ | `time/train` | Total elapsed training time | +-------------------------------------+-----------------------------------------------------------+ From 74c7d3bed8dbab110296dc5827dffb8de023a576 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Thu, 11 Jul 2024 12:51:37 -0700 Subject: [PATCH 59/69] add comment (#3470) --- composer/trainer/trainer.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index b410e8aa96..501bc4bf09 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -1652,6 +1652,7 @@ def __init__( # TP wrap if self.state.tp_config is not None: + # Init with globally fixed seed so all HSDP replicas have the same initial weights with reproducibility.seed_context(self.state.rank_zero_seed): prepare_tp_module( model, @@ -1660,6 +1661,7 @@ def __init__( # FSDP wrap if not using monolith checkpoint on rank 0 only if self.state.fsdp_config is not None and self.state.fsdp_config.auto_wrap and not self.state.load_monolith_rank0_only: + # Init with globally fixed seed so all HSDP replicas have the same initial weights with reproducibility.seed_context(self.state.rank_zero_seed): prepare_fsdp_module( model, @@ -1829,6 +1831,7 @@ def __init__( not self.state.fsdp_enabled and self.state.fsdp_config is not None and self.state.fsdp_config.auto_wrap and self.state.load_monolith_rank0_only ): + # Init with globally fixed seed so all HSDP replicas have the same initial weights with reproducibility.seed_context(self.state.rank_zero_seed): prepare_fsdp_module(model, optimizers, self.state.fsdp_config, precision, device, auto_microbatching) From 14bc187d82c9509c52bbf65d2614e6141e4701d1 Mon Sep 17 00:00:00 2001 From: Charles Tang Date: Thu, 11 Jul 2024 
13:28:47 -0700 Subject: [PATCH 60/69] Change pytorch eval for FP8 to default to fall back to BF16 (#3454) --- composer/core/precision.py | 4 +++- composer/trainer/trainer.py | 26 +++++++++++++++++++++----- tests/fixtures/fixtures.py | 3 ++- tests/trainer/test_trainer_eval.py | 30 ++++++++++++++++++++++++++++++ 4 files changed, 56 insertions(+), 7 deletions(-) diff --git a/composer/core/precision.py b/composer/core/precision.py index ea08a10c56..bb91fc64d1 100644 --- a/composer/core/precision.py +++ b/composer/core/precision.py @@ -40,6 +40,7 @@ class Precision(StringEnum): def get_precision_context( precision: Union[str, Precision], precision_config: Optional[dict[str, Any]] = None, + fp8_autocast_enabled: bool = True, ) -> Generator[None, None, None]: """Returns a context manager to automatically cast to a specific precision. @@ -47,6 +48,7 @@ def get_precision_context( precision (str | Precision): Precision for the context precision_config (Optional[dict[str, Any]]): Config for FP8 scaling strategy. See parameters for `DelayedScaling `_. + fp8_autocast_enabled (bool): Whether to enable FP8 autocast. Defaults to True. """ precision = Precision(precision) if precision == Precision.FP32: @@ -86,7 +88,7 @@ def get_precision_context( 'amax_compute_algo': 'max', } fp8_recipe = DelayedScaling(**precision_config) - with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe): + with te.fp8_autocast(enabled=fp8_autocast_enabled, fp8_recipe=fp8_recipe): # The te.onnx_export flag ensures that we save all fp8 buffers # as tensors instead of bytes. This is necessary for proper # saving and resumption of checkpoints. diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index 501bc4bf09..b62b3d3e58 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -460,10 +460,15 @@ def _get_ddp_sync_strategy(ddp_sync_strategy: Optional[Union[str, DDPSyncStrateg return ddp_sync_strategy -def _get_precision_context(precision: Precision, precision_config: Optional[dict[str, Any]], deepspeed_enabled: bool): +def _get_precision_context( + precision: Precision, + precision_config: Optional[dict[str, Any]], + deepspeed_enabled: bool, + fp8_autocast_enabled: bool = True, +): if deepspeed_enabled: return contextlib.nullcontext() - return get_precision_context(precision, precision_config) + return get_precision_context(precision, precision_config, fp8_autocast_enabled) def _generate_run_name() -> str: @@ -2675,10 +2680,15 @@ def _train_loop(self) -> None: def _eval_train_metrics(self, device_batch): assert self._train_data_spec is not None, 'The train data spec should be set on __init__ or fit()' assert self.state.train_metrics is not None, 'The train metrics should be set on __init__ or fit()' - + # We disable FP8 autocast in eval metrics and default to the activation dtype for the forward pass + # This is because FP8 in TE requires all eval data sizes to be divisible by 16 which does not hold for all evaluation datasets. + # See https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/examples/fp8_primer.html for more info. + # Note: the activation dtype is BF16 if FSDP Mixed Precision PURE is enabled and FP32 if FSDP Mixed Precision FULL is enabled. + # See https://github.com/NVIDIA/TransformerEngine/blob/8e039fdcd98fc56582d81e373880c1509c2b8f73/transformer_engine/pytorch/module/linear.py#L250-L252 and \ + # https://github.com/NVIDIA/TransformerEngine/blob/8e039fdcd98fc56582d81e373880c1509c2b8f73/transformer_engine/pytorch/module/base.py#L495-L513 for more info. 
with torch.no_grad(),\ model_eval_mode(self.state.model),\ - _get_precision_context(self.state.precision, self.state.precision_config, self.state.deepspeed_enabled): + _get_precision_context(self.state.precision, self.state.precision_config, self.state.deepspeed_enabled, fp8_autocast_enabled=False): eval_outputs = self._original_model.eval_forward(device_batch, self.state.outputs) for metric in self.state.train_metrics.values(): self._original_model.update_metric( @@ -3473,11 +3483,17 @@ def _eval_loop( )[0] self.engine.run_event(Event.EVAL_BEFORE_FORWARD) - + # We disable FP8 autocast in eval mode and default to the activation dtype for the forward pass + # This is because FP8 in TE requires all eval data sizes to be divisible by 16 which does not hold for all evaluation datasets. + # See https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/examples/fp8_primer.html for more info. + # Note: the activation dtype is BF16 if FSDP Mixed Precision PURE is enabled and FP32 if FSDP Mixed Precision FULL is enabled. + # See https://github.com/NVIDIA/TransformerEngine/blob/8e039fdcd98fc56582d81e373880c1509c2b8f73/transformer_engine/pytorch/module/linear.py#L250-L252 and \ + # https://github.com/NVIDIA/TransformerEngine/blob/8e039fdcd98fc56582d81e373880c1509c2b8f73/transformer_engine/pytorch/module/base.py#L495-L513 for more info. with _get_precision_context( self.state.precision, self.state.precision_config, self.state.deepspeed_enabled, + fp8_autocast_enabled=False, ): self.state.outputs = self._original_model.eval_forward(self.state.batch) diff --git a/tests/fixtures/fixtures.py b/tests/fixtures/fixtures.py index f587079073..c4dd3fa65f 100644 --- a/tests/fixtures/fixtures.py +++ b/tests/fixtures/fixtures.py @@ -14,7 +14,7 @@ from composer.core import State from composer.devices import DeviceCPU, DeviceGPU from composer.loggers import Logger -from composer.utils import dist +from composer.utils import dist, retry from tests.common import RandomClassificationDataset, SimpleModel from tests.conftest import _get_option @@ -310,6 +310,7 @@ def _session_tiny_t5_config(): # type: ignore return tiny_t5_config_helper() +@retry(num_attempts=3) def tiny_t5_tokenizer_helper(): transformers = pytest.importorskip('transformers') diff --git a/tests/trainer/test_trainer_eval.py b/tests/trainer/test_trainer_eval.py index b548efde81..9a2d8d6ab4 100644 --- a/tests/trainer/test_trainer_eval.py +++ b/tests/trainer/test_trainer_eval.py @@ -92,6 +92,36 @@ def test_eval_with_nondivisible_dataset(world_size: int, size: int, batch_size: assert count.item() == size +from unittest.mock import patch + + +@pytest.mark.gpu +def test_amp_fp8_eval_casts_to_bf16(): + # Check that we can import FP8 with TE. If not, skip this test. + try: + import transformer_engine # pyright: ignore + except ImportError: + pytest.skip('Precision amp_fp8 requires transformer-engine to be installed',) + + # Mocking the transformer_engine.pytorch.fp8_autocast and running model eval. + with patch('transformer_engine.pytorch.fp8_autocast') as mock_fp8_autocast: + # Construct the trainer + trainer = Trainer(model=SimpleModel(), device='gpu', precision='amp_fp8') + # Evaluate the model + dataset = RandomClassificationDataset() + trainer.eval(eval_dataloader=DataLoader( + dataset=dataset, + batch_size=10, + sampler=dist.get_sampler(dataset), + ),) + + # Check that te.fp8_autocast was called with enabled=False. + # This ensures that we disable the FP8 context on eval. 
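The behavior under test is the new `fp8_autocast_enabled` flag on `get_precision_context`: during evaluation the FP8 recipe stays configured, but autocast is turned off so the forward pass runs in the activation dtype (BF16 under FSDP mixed-precision PURE, FP32 under FULL). A hedged sketch of calling it directly; the import path is assumed from the module's location, and `model`/`batch` are placeholders for a ComposerModel and an eval batch:

from composer.core.precision import get_precision_context

# Eval-time forward pass: keep the FP8 recipe around, but disable FP8 autocast so
# TE's divisible-by-16 shape requirement does not apply to eval batches.
with get_precision_context('amp_fp8', precision_config=None, fp8_autocast_enabled=False):
    outputs = model.eval_forward(batch)  # placeholder model/batch, for illustration only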
+ actual_call = mock_fp8_autocast.call_args_list[0] + actual_call_args = actual_call._get_call_arguments()[1] + assert actual_call_args['enabled'] is False + + def test_eval_call_with_trainer_evaluators(): trainer_dataset = RandomClassificationDataset() trainer_evaluator = Evaluator( From a5dc1555da1a1e9c7c4b707d2a66e8c244d614c6 Mon Sep 17 00:00:00 2001 From: Brian <23239305+b-chu@users.noreply.github.com> Date: Mon, 15 Jul 2024 10:27:38 -0400 Subject: [PATCH 61/69] Fix checkpoint events (#3468) --- composer/callbacks/checkpoint_saver.py | 14 +++++++++++--- composer/loggers/remote_uploader_downloader.py | 8 +++----- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/composer/callbacks/checkpoint_saver.py b/composer/callbacks/checkpoint_saver.py index 29468e66c3..661b3046ba 100644 --- a/composer/callbacks/checkpoint_saver.py +++ b/composer/callbacks/checkpoint_saver.py @@ -11,11 +11,12 @@ import shutil import tempfile import textwrap +import time from pathlib import Path from typing import Any, Callable, Optional, Union from composer.core import Callback, Event, State, Time, Timestamp -from composer.loggers import Logger, MLFlowLogger +from composer.loggers import Logger, MLFlowLogger, MosaicMLLogger from composer.utils import ( FORMAT_NAME_WITH_DIST_AND_TIME_TABLE, FORMAT_NAME_WITH_DIST_TABLE, @@ -619,8 +620,13 @@ def _rotate_checkpoints(self, sharding_enabled: bool = False): if dist.get_global_rank() == 0: shutil.rmtree(prefix_dir) + def _log_checkpoint_upload(self, logger: Logger): + for destination in logger.destinations: + if isinstance(destination, MosaicMLLogger): + destination.log_metadata({'checkpoint_uploaded_time': time.time()}, force_flush=True) + def batch_end(self, state: State, logger: Logger) -> None: - del state, logger # unused + del state # unused if self.remote_uploader is None: return self.remote_uploader.check_workers() @@ -643,13 +649,14 @@ def batch_end(self, state: State, logger: Logger) -> None: file_path=local_symlink_file, overwrite=True, ) + self._log_checkpoint_upload(logger) break else: raise RuntimeError(f'Failed to check if checkpoint files upload finish: {result}') self.symlink_upload_tasks = undone_symlink_upload_tasks def fit_end(self, state: State, logger: Logger) -> None: - del state, logger # unused + del state # unused if self.remote_uploader is None: return log.info('Waiting for checkpoint uploading to finish') @@ -666,6 +673,7 @@ def fit_end(self, state: State, logger: Logger) -> None: overwrite=True, ) symlink_upload_future.result() + self._log_checkpoint_upload(logger) else: raise RuntimeError(f'Failed to check if checkpoint files upload finish: {result}') log.info('Checkpoint uploading finished!') diff --git a/composer/loggers/remote_uploader_downloader.py b/composer/loggers/remote_uploader_downloader.py index 9378d5a8d4..a143ac1421 100644 --- a/composer/loggers/remote_uploader_downloader.py +++ b/composer/loggers/remote_uploader_downloader.py @@ -22,7 +22,7 @@ import torch -from composer.loggers import Logger, MosaicMLLogger +from composer.loggers import Logger from composer.loggers.logger_destination import LoggerDestination from composer.utils import ( MLFlowObjectStore, @@ -308,12 +308,13 @@ def remote_backend(self) -> ObjectStore: return self._remote_backend def init(self, state: State, logger: Logger) -> None: + del logger # unused + if self._worker_flag is not None: raise RuntimeError('The RemoteUploaderDownloader is already initialized.') self._worker_flag = self._finished_cls() self._run_name = state.run_name 
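The `_log_checkpoint_upload` helper added above is the pattern this patch standardizes on: instead of the uploader holding a logger reference, the callback filters `logger.destinations` for a `MosaicMLLogger` and pushes a timestamped metadata entry. A hedged sketch of that pattern in a standalone callback; the callback name and metadata key are made up for illustration:

import time

from composer.core import Callback, State
from composer.loggers import Logger, MosaicMLLogger

class UploadedEventCallback(Callback):
    """Illustrative callback that stamps an event into MosaicML run metadata."""

    def batch_end(self, state: State, logger: Logger) -> None:
        del state  # unused
        for destination in logger.destinations:
            if isinstance(destination, MosaicMLLogger):
                # force_flush pushes the metadata immediately rather than waiting
                # for the logger's periodic flush
                destination.log_metadata({'my_event_time': time.time()}, force_flush=True)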
file_name_to_test = self._remote_file_name('.credentials_validated_successfully') - self._logger = logger # Create the enqueue thread self._enqueue_thread_flag = self._finished_cls() @@ -426,9 +427,6 @@ def _enqueue_uploads(self): break self._enqueued_objects.remove(object_name) self._completed_queue.task_done() - for destination in self._logger.destinations: - if isinstance(destination, MosaicMLLogger): - destination.log_metadata({'checkpoint_uploaded_time': time.time()}, force_flush=True) # Enqueue all objects that are in self._logged_objects but not in self._file_upload_queue objects_to_delete = [] From 69b8b236b6705060130c7ca682a62528d3fba2ac Mon Sep 17 00:00:00 2001 From: Ethan Ma Date: Tue, 16 Jul 2024 12:55:52 -0700 Subject: [PATCH 62/69] Add mosaicmllogger attr for fit start (#3467) --- composer/loggers/mosaicml_logger.py | 5 +++++ tests/loggers/test_mosaicml_logger.py | 1 + 2 files changed, 6 insertions(+) diff --git a/composer/loggers/mosaicml_logger.py b/composer/loggers/mosaicml_logger.py index d7c83b85fa..2de7243d31 100644 --- a/composer/loggers/mosaicml_logger.py +++ b/composer/loggers/mosaicml_logger.py @@ -135,6 +135,11 @@ def fit_end(self, state: State, logger: Logger) -> None: self.log_metadata(training_progress_data) self._flush_metadata(force_flush=True) + def fit_start(self, state: State, logger: Logger) -> None: + # Log model training started time for run events + self.log_metadata({'train_started_time': time.time()}) + self._flush_metadata(force_flush=True) + def eval_end(self, state: State, logger: Logger) -> None: self._flush_metadata(force_flush=True) diff --git a/tests/loggers/test_mosaicml_logger.py b/tests/loggers/test_mosaicml_logger.py index 795c8da56b..e308dab122 100644 --- a/tests/loggers/test_mosaicml_logger.py +++ b/tests/loggers/test_mosaicml_logger.py @@ -321,6 +321,7 @@ def test_run_events_logged(monkeypatch): assert metadata['mosaicml/training_progress'] == '[batch=4/4]' assert 'mosaicml/training_sub_progress' not in metadata assert isinstance(metadata['mosaicml/train_finished_time'], float) + assert isinstance(metadata['mosaicml/train_started_time'], float) def test_token_training_progress_metrics(): From 15c329e260c9bfe2770333f3e64a071fb5c60171 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 18 Jul 2024 17:25:28 +0000 Subject: [PATCH 63/69] Bump coverage[toml] from 7.5.4 to 7.6.0 (#3471) Bumps [coverage[toml]](https://github.com/nedbat/coveragepy) from 7.5.4 to 7.6.0. - [Release notes](https://github.com/nedbat/coveragepy/releases) - [Changelog](https://github.com/nedbat/coveragepy/blob/master/CHANGES.rst) - [Commits](https://github.com/nedbat/coveragepy/compare/7.5.4...7.6.0) --- updated-dependencies: - dependency-name: coverage[toml] dependency-type: direct:development update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8f8498392d..eccba856fd 100644 --- a/setup.py +++ b/setup.py @@ -103,7 +103,7 @@ def package_files(prefix: str, directory: str, extension: str): # Should manually update dependency versions occassionally. 
'custom_inherit==2.4.1', 'junitparser==3.1.2', - 'coverage[toml]==7.5.4', + 'coverage[toml]==7.6.0', 'fasteners==0.18', # object store tests require fasteners 'pytest==7.4.4', 'ipython==8.11.0', From 8a09a3be711038ddf3bccb88402e2b438ed3208c Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Sun, 21 Jul 2024 15:18:15 -0700 Subject: [PATCH 64/69] Bump flash attention to 2.6.1 (#3476) --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 970af2f1ef..d854715568 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -261,7 +261,7 @@ RUN if [ -n "$MOFED_VERSION" ] ; then \ RUN if [ -n "$CUDA_VERSION" ] ; then \ pip${PYTHON_VERSION} install --upgrade --no-cache-dir ninja==1.11.1 && \ pip${PYTHON_VERSION} install --upgrade --no-cache-dir --force-reinstall packaging==22.0 && \ - MAX_JOBS=1 pip${PYTHON_VERSION} install --no-cache-dir --no-build-isolation flash-attn==2.5.8; \ + MAX_JOBS=1 pip${PYTHON_VERSION} install --no-cache-dir --no-build-isolation flash-attn==2.6.1; \ cd .. ; \ fi From 779ff3e9a218f6caa696b032677133938085e79d Mon Sep 17 00:00:00 2001 From: Kevin DeShawn Date: Mon, 22 Jul 2024 16:03:50 -0500 Subject: [PATCH 65/69] cpu --- .github/workflows/pr-cpu.yaml | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index 23129715db..1e2d832e74 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -9,7 +9,8 @@ concurrency: cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }} jobs: pytest-cpu: - uses: mosaicml/ci-testing/.github/workflows/pytest-cpu.yaml@v0.0.9 + name: ${{ matrix.name }} + runs-on: linux-ubuntu-latest strategy: matrix: include: @@ -29,20 +30,21 @@ jobs: container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04 markers: not daily and not remote and not gpu and doctest pytest_command: coverage run -m pytest tests/test_docs.py - name: ${{ matrix.name }} - if: github.repository_owner == 'mosaicml' - with: - composer_package_name: mosaicml - container: ${{ matrix.container }} - name: ${{ matrix.name }} - pip_deps: "[all]" - pytest-command: ${{ matrix.pytest_command }} - pytest-markers: ${{ matrix.markers }} - safe_directory: composer + steps: + - name: Checkout code + uses: actions/checkout@v2 + - name: Run PR CPU Tests + uses: mosaicml/ci-testing/.github/actions/pytest-cpu@v0.1.0 + with: + container: ${{ matrix.container }} + name: ${{ matrix.name }} + pip_deps: "[all]" + pytest_command: ${{ matrix.pytest_command }} + pytest_markers: ${{ matrix.markers }} + safe_directory: composer + composer_package_name: mosaicml coverage: uses: ./.github/workflows/coverage.yaml name: Coverage Results if: github.repository_owner == 'mosaicml' needs: [pytest-cpu] - with: - download-path: artifacts From 2c0eac2ad47bd667d68648b72def7cbba903d9f9 Mon Sep 17 00:00:00 2001 From: Kevin DeShawn Date: Mon, 22 Jul 2024 16:22:33 -0500 Subject: [PATCH 66/69] gpu --- .github/workflows/pr-gpu.yaml | 103 +++++++++++++++++++--------------- 1 file changed, 57 insertions(+), 46 deletions(-) diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml index f6de8908c1..08365b2262 100644 --- a/.github/workflows/pr-gpu.yaml +++ b/.github/workflows/pr-gpu.yaml @@ -9,7 +9,8 @@ concurrency: cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }} jobs: pytest-gpu-1: - uses: 
mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.9 + name: ${{ matrix.name }} + runs-on: linux-ubuntu-latest strategy: matrix: include: @@ -18,24 +19,27 @@ jobs: markers: not daily and not remote and gpu and (doctest or not doctest) pytest_command: coverage run -m pytest composer_package_name: mosaicml - name: ${{ matrix.name }} if: github.repository_owner == 'mosaicml' - with: - composer_package_name: ${{ matrix.composer_package_name }} - container: ${{ matrix.container }} - git_repo: mosaicml/composer - mcloud-timeout: 2250 - name: ${{ matrix.name }} - pip_deps: "[all]" - pytest-command: ${{ matrix.pytest_command }} - pytest-markers: ${{ matrix.markers }} - python-version: 3.9 - gpu_num: 1 - secrets: - mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }} - + steps: + - name: Checkout code + uses: actions/checkout@v3 + - name: Run PR GPU Tests + uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.0 + with: + composer_package_name: ${{ matrix.composer_package_name }} + container: ${{ matrix.container }} + git_repo: mosaicml/composer + mcloud_timeout: 2250 + name: ${{ matrix.name }} + pip_deps: "[all]" + pytest_command: ${{ matrix.pytest_command }} + pytest_markers: ${{ matrix.markers }} + python_version: 3.9 + gpu_num: 1 + mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }} pytest-gpu-2: - uses: mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.9 + name: ${{ matrix.name }} + runs-on: linux-ubuntu-latest strategy: matrix: include: @@ -44,25 +48,29 @@ jobs: markers: not daily and not remote and gpu and (doctest or not doctest) pytest_command: coverage run -m pytest composer_package_name: mosaicml - name: ${{ matrix.name }} if: github.repository_owner == 'mosaicml' - with: - composer_package_name: ${{ matrix.composer_package_name }} - container: ${{ matrix.container }} - git_repo: mosaicml/composer - mcloud-timeout: 2250 - name: ${{ matrix.name }} - pip_deps: "[all]" - pytest-command: ${{ matrix.pytest_command }} - pytest-markers: ${{ matrix.markers }} - python-version: 3.9 - gpu_num: 2 - secrets: - mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }} + steps: + - name: Checkout code + uses: actions/checkout@v3 + - name: Run PR GPU Tests + uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.0 + with: + composer_package_name: ${{ matrix.composer_package_name }} + container: ${{ matrix.container }} + git_repo: mosaicml/composer + mcloud_timeout: 2250 + name: ${{ matrix.name }} + pip_deps: "[all]" + pytest_command: ${{ matrix.pytest_command }} + pytest_markers: ${{ matrix.markers }} + python_version: 3.9 + gpu_num: 2 + mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }} pytest-gpu-4: - uses: mosaicml/ci-testing/.github/workflows/pytest-gpu.yaml@v0.0.9 + name: ${{ matrix.name }} + runs-on: linux-ubuntu-latest strategy: matrix: include: @@ -71,18 +79,21 @@ jobs: markers: not daily and not remote and gpu and (doctest or not doctest) pytest_command: coverage run -m pytest composer_package_name: mosaicml - name: ${{ matrix.name }} if: github.repository_owner == 'mosaicml' - with: - composer_package_name: ${{ matrix.composer_package_name }} - container: ${{ matrix.container }} - git_repo: mosaicml/composer - mcloud-timeout: 2250 - name: ${{ matrix.name }} - pip_deps: "[all]" - pytest-command: ${{ matrix.pytest_command }} - pytest-markers: ${{ matrix.markers }} - python-version: 3.9 - gpu_num: 4 - secrets: - mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }} + steps: + - name: Checkout code + uses: actions/checkout@v3 + - name: Run PR GPU Tests + uses: 
mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.0 + with: + composer_package_name: ${{ matrix.composer_package_name }} + container: ${{ matrix.container }} + git_repo: mosaicml/composer + mcloud_timeout: 2250 + name: ${{ matrix.name }} + pip_deps: "[all]" + pytest_command: ${{ matrix.pytest_command }} + pytest_markers: ${{ matrix.markers }} + python_version: 3.9 + gpu_num: 4 + mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }} \ No newline at end of file From 2b3ff8c0cbee2e791f087aff92109cb48d50d8fa Mon Sep 17 00:00:00 2001 From: Kevin DeShawn Date: Mon, 22 Jul 2024 16:25:42 -0500 Subject: [PATCH 67/69] coverage fix --- .github/workflows/pr-cpu.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index 1e2d832e74..1303fb54c9 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -48,3 +48,5 @@ jobs: name: Coverage Results if: github.repository_owner == 'mosaicml' needs: [pytest-cpu] + with: + download-path: artifacts From bd6515c3e091d4171c31c0bb6caae2be550c3ee1 Mon Sep 17 00:00:00 2001 From: Kevin DeShawn Date: Mon, 22 Jul 2024 17:18:14 -0500 Subject: [PATCH 68/69] lint --- .github/workflows/pr-cpu.yaml | 24 ++++----- .github/workflows/pr-gpu.yaml | 99 ++++++++++++++++++----------------- 2 files changed, 63 insertions(+), 60 deletions(-) diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index 1303fb54c9..2e0ac7b20c 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -31,18 +31,18 @@ jobs: markers: not daily and not remote and not gpu and doctest pytest_command: coverage run -m pytest tests/test_docs.py steps: - - name: Checkout code - uses: actions/checkout@v2 - - name: Run PR CPU Tests - uses: mosaicml/ci-testing/.github/actions/pytest-cpu@v0.1.0 - with: - container: ${{ matrix.container }} - name: ${{ matrix.name }} - pip_deps: "[all]" - pytest_command: ${{ matrix.pytest_command }} - pytest_markers: ${{ matrix.markers }} - safe_directory: composer - composer_package_name: mosaicml + - name: Checkout code + uses: actions/checkout@v2 + - name: Run PR CPU Tests + uses: mosaicml/ci-testing/.github/actions/pytest-cpu@v0.1.0 + with: + container: ${{ matrix.container }} + name: ${{ matrix.name }} + pip_deps: "[all]" + pytest_command: ${{ matrix.pytest_command }} + pytest_markers: ${{ matrix.markers }} + safe_directory: composer + composer_package_name: mosaicml coverage: uses: ./.github/workflows/coverage.yaml name: Coverage Results diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml index 08365b2262..a2715a5844 100644 --- a/.github/workflows/pr-gpu.yaml +++ b/.github/workflows/pr-gpu.yaml @@ -21,22 +21,23 @@ jobs: composer_package_name: mosaicml if: github.repository_owner == 'mosaicml' steps: - - name: Checkout code - uses: actions/checkout@v3 - - name: Run PR GPU Tests - uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.0 - with: - composer_package_name: ${{ matrix.composer_package_name }} - container: ${{ matrix.container }} - git_repo: mosaicml/composer - mcloud_timeout: 2250 - name: ${{ matrix.name }} - pip_deps: "[all]" - pytest_command: ${{ matrix.pytest_command }} - pytest_markers: ${{ matrix.markers }} - python_version: 3.9 - gpu_num: 1 - mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }} + - name: Checkout code + uses: actions/checkout@v3 + - name: Run PR GPU Tests + uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.0 + with: + composer_package_name: ${{ matrix.composer_package_name }} + container: ${{ 
matrix.container }} + git_repo: mosaicml/composer + mcloud_timeout: 2250 + name: ${{ matrix.name }} + pip_deps: "[all]" + pytest_command: ${{ matrix.pytest_command }} + pytest_markers: ${{ matrix.markers }} + python_version: 3.9 + gpu_num: 1 + mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }} + ci_repo_gpu_test_ref: v0.1.0 pytest-gpu-2: name: ${{ matrix.name }} runs-on: linux-ubuntu-latest @@ -50,22 +51,23 @@ jobs: composer_package_name: mosaicml if: github.repository_owner == 'mosaicml' steps: - - name: Checkout code - uses: actions/checkout@v3 - - name: Run PR GPU Tests - uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.0 - with: - composer_package_name: ${{ matrix.composer_package_name }} - container: ${{ matrix.container }} - git_repo: mosaicml/composer - mcloud_timeout: 2250 - name: ${{ matrix.name }} - pip_deps: "[all]" - pytest_command: ${{ matrix.pytest_command }} - pytest_markers: ${{ matrix.markers }} - python_version: 3.9 - gpu_num: 2 - mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }} + - name: Checkout code + uses: actions/checkout@v3 + - name: Run PR GPU Tests + uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.0 + with: + composer_package_name: ${{ matrix.composer_package_name }} + container: ${{ matrix.container }} + git_repo: mosaicml/composer + mcloud_timeout: 2250 + name: ${{ matrix.name }} + pip_deps: "[all]" + pytest_command: ${{ matrix.pytest_command }} + pytest_markers: ${{ matrix.markers }} + python_version: 3.9 + gpu_num: 2 + mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }} + ci_repo_gpu_test_ref: v0.1.0 pytest-gpu-4: @@ -81,19 +83,20 @@ jobs: composer_package_name: mosaicml if: github.repository_owner == 'mosaicml' steps: - - name: Checkout code - uses: actions/checkout@v3 - - name: Run PR GPU Tests - uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.0 - with: - composer_package_name: ${{ matrix.composer_package_name }} - container: ${{ matrix.container }} - git_repo: mosaicml/composer - mcloud_timeout: 2250 - name: ${{ matrix.name }} - pip_deps: "[all]" - pytest_command: ${{ matrix.pytest_command }} - pytest_markers: ${{ matrix.markers }} - python_version: 3.9 - gpu_num: 4 - mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }} \ No newline at end of file + - name: Checkout code + uses: actions/checkout@v3 + - name: Run PR GPU Tests + uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.0 + with: + composer_package_name: ${{ matrix.composer_package_name }} + container: ${{ matrix.container }} + git_repo: mosaicml/composer + mcloud_timeout: 2250 + name: ${{ matrix.name }} + pip_deps: "[all]" + pytest_command: ${{ matrix.pytest_command }} + pytest_markers: ${{ matrix.markers }} + python_version: 3.9 + gpu_num: 4 + mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }} + ci_repo_gpu_test_ref: v0.1.0 \ No newline at end of file From 4488555a93498ffa28511630a787746a85cd47c5 Mon Sep 17 00:00:00 2001 From: Kevin DeShawn <126115026+KevDevSha@users.noreply.github.com> Date: Mon, 22 Jul 2024 17:53:20 -0500 Subject: [PATCH 69/69] Update pr-cpu.yaml --- .github/workflows/pr-cpu.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index 2e0ac7b20c..9636f87352 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -1,6 +1,6 @@ name: PR CPU tests on: - pull_request: + pull_request_target: workflow_dispatch: # Cancel old runs when a new commit is pushed to the same branch if not on main # or dev