From 081dabb0c5bdf297e0082c4c8a0f7e15070b2eca Mon Sep 17 00:00:00 2001 From: Evan Racah Date: Tue, 6 Aug 2024 13:21:08 -0700 Subject: [PATCH 01/12] Fix autoresume docstring (save_overwrite) (#3526) save_overwrite is no longer required to be false for autoresume --- composer/trainer/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index 8b1c6d8f93..b2f829ca10 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -975,7 +975,7 @@ class Trainer: (default: ``False``) autoresume (bool, optional): Whether or not to enable autoresume, which allows for stopping and resuming training. This allows use of spot instances, as the training run is now fault tolerant. This parameter - requires ``save_folder`` and ``run_name`` to be specified and ``save_overwrite`` to be ``False``. + requires ``save_folder`` and ``run_name`` to be specified. (default: ``False``) When enabled, the save_folder is checked for checkpoints of the format ``"{save_folder}/{save_latest_filename}"``, From bd7227c0c8205534bcd6ed17f646ad0a0267a2b3 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Wed, 7 Aug 2024 09:45:34 -0700 Subject: [PATCH 02/12] Unpin pip (#3524) --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 12ea07bd51..80ae8bad2e 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -172,7 +172,7 @@ RUN add-apt-repository ppa:deadsnakes/ppa && \ rm -rf /var/lib/apt/lists/* RUN curl -fsSL https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} - && \ - pip${PYTHON_VERSION} install --no-cache-dir --upgrade 'pip<23' 'setuptools<70.0.0' + pip${PYTHON_VERSION} install --no-cache-dir --upgrade pip 'setuptools<70.0.0' ################# # Install Pytorch From a15b18ce18db6df4cff4e10251adcb9d5c5845db Mon Sep 17 00:00:00 2001 From: Charles Tang Date: Wed, 7 Aug 2024 
10:18:54 -0700 Subject: [PATCH 03/12] Add FSDP input validation for use_orig_params and activation_cpu_offload flag (#3515) --- composer/core/state.py | 4 ++++ tests/trainer/test_fsdp.py | 22 ++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/composer/core/state.py b/composer/core/state.py index 5c429a1cd4..ca20dd1011 100644 --- a/composer/core/state.py +++ b/composer/core/state.py @@ -640,6 +640,10 @@ def _validate_parallelism_configs(self): if error_message != '': raise ValueError(error_message) + # Validate FSDP config parameters. + if self.fsdp_config and self.fsdp_config.activation_cpu_offload and not self.fsdp_config.use_orig_params: + raise ValueError('activation_cpu_offload=True is not supported with use_orig_params=False.') + # Validate FSDP state dict type if self.fsdp_state_dict_type not in [None, 'full', 'sharded']: if self.fsdp_state_dict_type == 'local': diff --git a/tests/trainer/test_fsdp.py b/tests/trainer/test_fsdp.py index 4c936f5402..7b9bd4825c 100644 --- a/tests/trainer/test_fsdp.py +++ b/tests/trainer/test_fsdp.py @@ -621,6 +621,28 @@ def test_fsdp_shard(world_size: int): ) +@pytest.mark.gpu +@world_size(2) +def test_fsdp_invalid_config_throws_error(world_size: int): + model = SimpleModel() + model.fc1._fsdp_wrap = True # pyright: ignore[reportGeneralTypeIssues] + model.fc2._fsdp_wrap = True # pyright: ignore[reportGeneralTypeIssues] + + expected_error = 'activation_cpu_offload=True is not supported with use_orig_params=False.' 
+ + with pytest.raises(ValueError, match=expected_error): + _ = Trainer( + model=model, + parallelism_config={ + 'fsdp': { + 'use_orig_params': False, + 'activation_cpu_offload': True, + }, + }, + max_duration='3ba', + ) + + @pytest.mark.gpu @world_size(2) def test_fsdp_shard_and_replicate(world_size: int): From 4a9756f11c51dae01daba6c3bddf9d94169f1d89 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Thu, 8 Aug 2024 11:23:35 -0700 Subject: [PATCH 04/12] hasattr check for Wandb 0.17.6 (#3531) --- .github/workflows/daily.yaml | 3 --- composer/loggers/wandb_logger.py | 6 +++++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml index aa97c755c8..d7616ffbbf 100644 --- a/.github/workflows/daily.yaml +++ b/.github/workflows/daily.yaml @@ -77,13 +77,10 @@ jobs: pytest-command: ${{ matrix.pytest_command }} pytest-markers: ${{ matrix.markers }} composer_package_name: ${{ matrix.composer_package_name }} - pytest-wandb-entity: "mosaicml-public-integration-tests" - pytest-wandb-project: "integration-tests-${{ github.sha }}" safe_directory: composer secrets: aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - wandb-api-key: ${{ secrets.WANDB_API_KEY }} code-eval-device: ${{ secrets.CODE_EVAL_DEVICE }} code-eval-url: ${{ secrets.CODE_EVAL_URL }} code-eval-apikey: ${{ secrets.CODE_EVAL_APIKEY }} diff --git a/composer/loggers/wandb_logger.py b/composer/loggers/wandb_logger.py index 4a2afe5c84..d76ee1fbac 100644 --- a/composer/loggers/wandb_logger.py +++ b/composer/loggers/wandb_logger.py @@ -200,7 +200,11 @@ def init(self, state: State, logger: Logger) -> None: if self._enabled: wandb.init(**self._init_kwargs) assert wandb.run is not None, 'The wandb run is set after init' - entity_and_project = [str(wandb.run.entity), str(wandb.run.project)] + if hasattr(wandb.run, 'entity') and hasattr(wandb.run, 'project'): + entity_and_project = 
[str(wandb.run.entity), str(wandb.run.project)] + else: + # Run does not have attributes if wandb is in disabled mode, so we must mock it + entity_and_project = ['disabled', 'disabled'] self.run_dir = wandb.run.dir self.run_url = wandb.run.get_url() atexit.register(self._set_is_in_atexit) From f6c00b8292c34280fa1297a14ef1c3cee5f12e22 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Thu, 8 Aug 2024 12:42:24 -0700 Subject: [PATCH 05/12] Fix FSDP Config Validation (#3530) --- composer/core/state.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/composer/core/state.py b/composer/core/state.py index ca20dd1011..cbd7fc41db 100644 --- a/composer/core/state.py +++ b/composer/core/state.py @@ -641,12 +641,12 @@ def _validate_parallelism_configs(self): raise ValueError(error_message) # Validate FSDP config parameters. - if self.fsdp_config and self.fsdp_config.activation_cpu_offload and not self.fsdp_config.use_orig_params: + if self.fsdp_config is not None and self.fsdp_config.activation_cpu_offload and not self.fsdp_config.use_orig_params: raise ValueError('activation_cpu_offload=True is not supported with use_orig_params=False.') # Validate FSDP state dict type - if self.fsdp_state_dict_type not in [None, 'full', 'sharded']: - if self.fsdp_state_dict_type == 'local': + if self.fsdp_config is not None and self.fsdp_config.state_dict_type not in [None, 'full', 'sharded']: + if self.fsdp_config.state_dict_type == 'local': raise ValueError( 'Composer and PyTorch no longer support saving or loading local state dicts. 
' 'To upgrade an older checkpoint, use Composer version 0.18.1 and export as ' @@ -654,7 +654,7 @@ def _validate_parallelism_configs(self): ) raise ValueError( f'fsdp_state_dict_type must be one of [None, "full", "sharded"], but got ' - f'{self.fsdp_state_dict_type}', + f'{self.fsdp_config.state_dict_type}', ) if self.fsdp_sharded_state_dict_enabled and self.save_metrics: # Sharded state dict breaks in many different ways with torchmetrics, due to both sharding From 14f5445f94ab3b558d890339fe2fe5e57d41dcca Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Thu, 8 Aug 2024 13:18:17 -0700 Subject: [PATCH 06/12] Remove dev on github workflows (#3536) --- .github/PULL_REQUEST_TEMPLATE.md | 6 +++--- .github/workflows/code-quality.yaml | 1 - .github/workflows/daily.yaml | 1 - .github/workflows/pr-docker.yaml | 1 - .github/workflows/smoketest.yaml | 1 - 5 files changed, 3 insertions(+), 7 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 8b249a5ccf..a5e905ebc7 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -16,13 +16,13 @@ Example: --> # Before submitting -- [ ] Have you read the [contributor guidelines](https://github.com/mosaicml/composer/blob/dev/CONTRIBUTING.md)? +- [ ] Have you read the [contributor guidelines](https://github.com/mosaicml/composer/blob/main/CONTRIBUTING.md)? - [ ] Is this change a documentation change or typo fix? If so, skip the rest of this checklist. - [ ] Was this change discussed/approved in a GitHub issue first? It is much more likely to be merged if so. - [ ] Did you update any related docs and document your change? -- [ ] Did you update any related tests and add any new tests related to your change? (see [testing](https://github.com/mosaicml/composer/blob/dev/CONTRIBUTING.md#running-tests)) +- [ ] Did you update any related tests and add any new tests related to your change? 
(see [testing](https://github.com/mosaicml/composer/blob/main/CONTRIBUTING.md#running-tests)) - [ ] Did you run the tests locally to make sure they pass? -- [ ] Did you run `pre-commit` on your change? (see the `pre-commit` section of [prerequisites](https://github.com/mosaicml/composer/blob/dev/CONTRIBUTING.md#prerequisites)) +- [ ] Did you run `pre-commit` on your change? (see the `pre-commit` section of [prerequisites](https://github.com/mosaicml/composer/blob/main/CONTRIBUTING.md#prerequisites)) | Linux Distro | Flavor | PyTorch Version | CUDA Version | Python Version | Docker Tags | |----------------|----------|-------------------|---------------------|------------------|------------------------------------------------------------------------------------------| -| Ubuntu 20.04 | Base | 2.3.1 | 12.1.1 (Infiniband) | 3.11 | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.3.1 | 12.1.1 (EFA) | 3.11 | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws` | -| Ubuntu 20.04 | Base | 2.3.1 | cpu | 3.11 | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.4.0 | 12.4.1 (Infiniband) | 3.11 | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.4.0 | 12.4.1 (EFA) | 3.11 | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws` | +| Ubuntu 20.04 | Base | 2.4.0 | cpu | 3.11 | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.3.1 | 12.1.1 (Infiniband) | 3.11 | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.3.1 | 12.1.1 (EFA) | 3.11 | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws` | +| Ubuntu 20.04 | Base | 2.3.1 | cpu | 3.11 | `mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04` | | Ubuntu 20.04 | Base | 2.2.2 | 12.1.1 
(Infiniband) | 3.11 | `mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04` | | Ubuntu 20.04 | Base | 2.2.2 | 12.1.1 (EFA) | 3.11 | `mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04-aws` | | Ubuntu 20.04 | Base | 2.2.2 | cpu | 3.11 | `mosaicml/pytorch:2.2.2_cpu-python3.11-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.1.2 | 12.1.1 (Infiniband) | 3.10 | `mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.1.2 | 12.1.1 (EFA) | 3.10 | `mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04-aws` | -| Ubuntu 20.04 | Base | 2.1.2 | cpu | 3.10 | `mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04` | **Note**: The `mosaicml/pytorch:latest`, `mosaicml/pytorch:latest_cpu`, and `mosaicml/pytorch:latest-aws` diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml index ee74d12309..2fb084a78b 100644 --- a/docker/build_matrix.yaml +++ b/docker/build_matrix.yaml @@ -1,79 +1,53 @@ # This file is automatically generated by generate_build_matrix.py. DO NOT EDIT! - AWS_OFI_NCCL_VERSION: '' - BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04 - CUDA_VERSION: 12.1.1 - IMAGE_NAME: torch-2-3-1-cu121 + BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04 + CUDA_VERSION: 12.4.1 + IMAGE_NAME: torch-2-4-0-cu124 MOFED_VERSION: latest-23.10 - NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 - brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 - brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471 - brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471 - brand=tesla,driver>=510,driver<511 brand=unknown,driver>=510,driver<511 brand=nvidia,driver>=510,driver<511 - brand=nvidiartx,driver>=510,driver<511 brand=geforce,driver>=510,driver<511 brand=geforcertx,driver>=510,driver<511 - brand=quadro,driver>=510,driver<511 
brand=quadrortx,driver>=510,driver<511 brand=titan,driver>=510,driver<511 - brand=titanrtx,driver>=510,driver<511 brand=tesla,driver>=515,driver<516 brand=unknown,driver>=515,driver<516 - brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=515,driver<516 brand=geforce,driver>=515,driver<516 - brand=geforcertx,driver>=515,driver<516 brand=quadro,driver>=515,driver<516 brand=quadrortx,driver>=515,driver<516 - brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=515,driver<516 brand=tesla,driver>=525,driver<526 - brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 - brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 - brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 + NVIDIA_REQUIRE_CUDA_OVERRIDE: '' PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.3.1 + PYTORCH_VERSION: 2.4.0 TAGS: - - mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04 + - mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04 - mosaicml/pytorch:latest TARGET: pytorch_stage - TORCHVISION_VERSION: 0.18.1 + TORCHVISION_VERSION: 0.19.0 - AWS_OFI_NCCL_VERSION: v1.9.1-aws - BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04 - CUDA_VERSION: 12.1.1 - IMAGE_NAME: torch-2-3-1-cu121-aws + BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04 + CUDA_VERSION: 12.4.1 + IMAGE_NAME: torch-2-4-0-cu124-aws MOFED_VERSION: '' - NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 - brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 - brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471 - brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471 - 
brand=tesla,driver>=510,driver<511 brand=unknown,driver>=510,driver<511 brand=nvidia,driver>=510,driver<511 - brand=nvidiartx,driver>=510,driver<511 brand=geforce,driver>=510,driver<511 brand=geforcertx,driver>=510,driver<511 - brand=quadro,driver>=510,driver<511 brand=quadrortx,driver>=510,driver<511 brand=titan,driver>=510,driver<511 - brand=titanrtx,driver>=510,driver<511 brand=tesla,driver>=515,driver<516 brand=unknown,driver>=515,driver<516 - brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=515,driver<516 brand=geforce,driver>=515,driver<516 - brand=geforcertx,driver>=515,driver<516 brand=quadro,driver>=515,driver<516 brand=quadrortx,driver>=515,driver<516 - brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=515,driver<516 brand=tesla,driver>=525,driver<526 - brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 - brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 - brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 + NVIDIA_REQUIRE_CUDA_OVERRIDE: '' PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.3.1 + PYTORCH_VERSION: 2.4.0 TAGS: - - mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws + - mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws - mosaicml/pytorch:latest-aws TARGET: pytorch_stage - TORCHVISION_VERSION: 0.18.1 + TORCHVISION_VERSION: 0.19.0 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: ubuntu:20.04 CUDA_VERSION: '' - IMAGE_NAME: torch-2-3-1-cpu + IMAGE_NAME: torch-2-4-0-cpu MOFED_VERSION: '' NVIDIA_REQUIRE_CUDA_OVERRIDE: '' PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.3.1 + PYTORCH_VERSION: 2.4.0 TAGS: - - mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04 + - mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 - mosaicml/pytorch:latest_cpu TARGET: 
pytorch_stage - TORCHVISION_VERSION: 0.18.1 + TORCHVISION_VERSION: 0.19.0 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04 CUDA_VERSION: 12.1.1 - IMAGE_NAME: torch-2-2-2-cu121 + IMAGE_NAME: torch-2-3-1-cu121 MOFED_VERSION: latest-23.10 NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 @@ -92,15 +66,15 @@ PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.2.2 + PYTORCH_VERSION: 2.3.1 TAGS: - - mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04 + - mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04 TARGET: pytorch_stage - TORCHVISION_VERSION: 0.17.2 + TORCHVISION_VERSION: 0.18.1 - AWS_OFI_NCCL_VERSION: v1.9.1-aws BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04 CUDA_VERSION: 12.1.1 - IMAGE_NAME: torch-2-2-2-cu121-aws + IMAGE_NAME: torch-2-3-1-cu121-aws MOFED_VERSION: '' NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 @@ -119,29 +93,29 @@ PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.2.2 + PYTORCH_VERSION: 2.3.1 TAGS: - - mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04-aws + - mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws TARGET: pytorch_stage - TORCHVISION_VERSION: 0.17.2 + TORCHVISION_VERSION: 0.18.1 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: ubuntu:20.04 CUDA_VERSION: '' - IMAGE_NAME: torch-2-2-2-cpu + IMAGE_NAME: torch-2-3-1-cpu MOFED_VERSION: '' NVIDIA_REQUIRE_CUDA_OVERRIDE: '' PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.2.2 + PYTORCH_VERSION: 2.3.1 TAGS: - - mosaicml/pytorch:2.2.2_cpu-python3.11-ubuntu20.04 + - 
mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04 TARGET: pytorch_stage - TORCHVISION_VERSION: 0.17.2 + TORCHVISION_VERSION: 0.18.1 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04 CUDA_VERSION: 12.1.1 - IMAGE_NAME: torch-2-1-2-cu121 + IMAGE_NAME: torch-2-2-2-cu121 MOFED_VERSION: latest-23.10 NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 @@ -157,18 +131,18 @@ brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 - PYTHON_VERSION: '3.10' + PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.1.2 + PYTORCH_VERSION: 2.2.2 TAGS: - - mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04 + - mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04 TARGET: pytorch_stage - TORCHVISION_VERSION: 0.16.2 + TORCHVISION_VERSION: 0.17.2 - AWS_OFI_NCCL_VERSION: v1.9.1-aws BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04 CUDA_VERSION: 12.1.1 - IMAGE_NAME: torch-2-1-2-cu121-aws + IMAGE_NAME: torch-2-2-2-cu121-aws MOFED_VERSION: '' NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 @@ -184,57 +158,44 @@ brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 brand=quadrortx,driver>=525,driver<526 
brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 - PYTHON_VERSION: '3.10' + PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.1.2 + PYTORCH_VERSION: 2.2.2 TAGS: - - mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04-aws + - mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04-aws TARGET: pytorch_stage - TORCHVISION_VERSION: 0.16.2 + TORCHVISION_VERSION: 0.17.2 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: ubuntu:20.04 CUDA_VERSION: '' - IMAGE_NAME: torch-2-1-2-cpu + IMAGE_NAME: torch-2-2-2-cpu MOFED_VERSION: '' NVIDIA_REQUIRE_CUDA_OVERRIDE: '' - PYTHON_VERSION: '3.10' + PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.1.2 + PYTORCH_VERSION: 2.2.2 TAGS: - - mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04 + - mosaicml/pytorch:2.2.2_cpu-python3.11-ubuntu20.04 TARGET: pytorch_stage - TORCHVISION_VERSION: 0.16.2 + TORCHVISION_VERSION: 0.17.2 - AWS_OFI_NCCL_VERSION: '' - BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04 + BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04 COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.5 - CUDA_VERSION: 12.1.1 + CUDA_VERSION: 12.4.1 IMAGE_NAME: composer-0-23-5 MOFED_VERSION: latest-23.10 - NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 - brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 - brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471 - brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471 - brand=tesla,driver>=510,driver<511 brand=unknown,driver>=510,driver<511 brand=nvidia,driver>=510,driver<511 - brand=nvidiartx,driver>=510,driver<511 brand=geforce,driver>=510,driver<511 brand=geforcertx,driver>=510,driver<511 - brand=quadro,driver>=510,driver<511 
brand=quadrortx,driver>=510,driver<511 brand=titan,driver>=510,driver<511 - brand=titanrtx,driver>=510,driver<511 brand=tesla,driver>=515,driver<516 brand=unknown,driver>=515,driver<516 - brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=515,driver<516 brand=geforce,driver>=515,driver<516 - brand=geforcertx,driver>=515,driver<516 brand=quadro,driver>=515,driver<516 brand=quadrortx,driver>=515,driver<516 - brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=515,driver<516 brand=tesla,driver>=525,driver<526 - brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 - brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 - brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 + NVIDIA_REQUIRE_CUDA_OVERRIDE: '' PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.3.1 + PYTORCH_VERSION: 2.4.0 TAGS: - mosaicml/composer:0.23.5 - mosaicml/composer:latest TARGET: composer_stage - TORCHVISION_VERSION: 0.18.1 + TORCHVISION_VERSION: 0.19.0 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: ubuntu:20.04 COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.5 @@ -245,9 +206,9 @@ PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.3.1 + PYTORCH_VERSION: 2.4.0 TAGS: - mosaicml/composer:0.23.5_cpu - mosaicml/composer:latest_cpu TARGET: composer_stage - TORCHVISION_VERSION: 0.18.1 + TORCHVISION_VERSION: 0.19.0 diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py index 74d9c7fed4..a1cf5bca3b 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -19,22 +19,24 @@ import yaml PRODUCTION_PYTHON_VERSION = '3.11' -PRODUCTION_PYTORCH_VERSION = '2.3.1' +PRODUCTION_PYTORCH_VERSION = '2.4.0' def _get_torchvision_version(pytorch_version: str): + if pytorch_version == '2.4.0': + return 
'0.19.0' if pytorch_version == '2.3.1': return '0.18.1' if pytorch_version == '2.2.2': return '0.17.2' - if pytorch_version == '2.1.2': - return '0.16.2' raise ValueError(f'Invalid pytorch_version: {pytorch_version}') def _get_base_image(cuda_version: str): if not cuda_version: return 'ubuntu:20.04' + if cuda_version == '12.4.1': + return f'nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04' return f'nvidia/cuda:{cuda_version}-cudnn8-devel-ubuntu20.04' @@ -42,12 +44,12 @@ def _get_cuda_version(pytorch_version: str, use_cuda: bool): # From https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/ if not use_cuda: return '' + if pytorch_version == '2.4.0': + return '12.4.1' if pytorch_version == '2.3.1': return '12.1.1' if pytorch_version == '2.2.2': return '12.1.1' - if pytorch_version == '2.1.2': - return '12.1.1' raise ValueError(f'Invalid pytorch_version: {pytorch_version}') @@ -167,7 +169,7 @@ def _write_table(table_tag: str, table_contents: str): def _main(): - python_pytorch_versions = [('3.11', '2.3.1'), ('3.11', '2.2.2'), ('3.10', '2.1.2')] + python_pytorch_versions = [('3.11', '2.4.0'), ('3.11', '2.3.1'), ('3.11', '2.2.2')] cuda_options = [True, False] stages = ['pytorch_stage'] interconnects = ['mellanox', 'EFA'] # mellanox is default, EFA needed for AWS diff --git a/setup.py b/setup.py index 11c82b5a37..4dbf584c32 100644 --- a/setup.py +++ b/setup.py @@ -80,8 +80,8 @@ def package_files(prefix: str, directory: str, extension: str): 'tqdm>=4.62.3,<5', 'torchmetrics>=1.4.0.post0,<1.4.1', 'torch_optimizer>=0.3.0,<0.4', - 'torchvision>=0.13.1,<0.18.2', - 'torch>=2.1.2,<2.3.2', + 'torchvision>=0.14.0,<0.19.1', + 'torch>=2.2.0,<2.4.1', 'requests>=2.26.0,<3', 'numpy>=1.21.5,<2.1.0', 'psutil>=5.8.0,<7', diff --git a/tests/trainer/test_fsdp_checkpoint.py b/tests/trainer/test_fsdp_checkpoint.py index a59e60172a..5bdf76ce8a 100644 --- a/tests/trainer/test_fsdp_checkpoint.py +++ b/tests/trainer/test_fsdp_checkpoint.py @@ -315,12 +315,10 @@ def 
test_fsdp_full_state_dict_load( use_tp: bool, use_hsdp: bool, ): - if use_hsdp: - pytest.xfail('Known PyTorch issue with HSDP, waiting for pytorch patch') + if use_hsdp and version.parse(torch.__version__) < version.parse('2.4.0'): + pytest.xfail('HSDP requires torch 2.4.0 or later') if use_tp: pytest.skip('TP on PyTorch 2.3 has full state dict issues.') - if (use_tp or use_hsdp) and version.parse(torch.__version__) < version.parse('2.3.0'): - pytest.skip('HSDP and TP require torch 2.3.0 or later') if autoresume: run_name = 'my-cool-autoresume-run' else: @@ -1153,7 +1151,10 @@ def set_up_planner( # suffix all keys with `foo_`` state_dict['state']['model'] = {k + '_foo': v for k, v in state_dict['state']['model'].items()} - super().set_up_planner(state_dict, is_coordinator) + super().set_up_planner( + state_dict=state_dict, + is_coordinator=is_coordinator, + ) class RenameLoadPlanner(DefaultLoadPlanner): @@ -1164,7 +1165,11 @@ def set_up_planner( is_coordinator: bool, ) -> None: if 'state' not in state_dict: - super().set_up_planner(state_dict, metadata, is_coordinator) + super().set_up_planner( + state_dict=state_dict, + metadata=metadata, + is_coordinator=is_coordinator, + ) return self.original_state_dict = state_dict diff --git a/tests/utils/test_inference.py b/tests/utils/test_inference.py index e7c374377d..69b78ead4c 100644 --- a/tests/utils/test_inference.py +++ b/tests/utils/test_inference.py @@ -196,7 +196,7 @@ def test_huggingface_export_for_inference_onnx(onnx_opset_version, tiny_bert_con ort_session = ort.InferenceSession(save_path, providers=['CPUExecutionProvider']) for key, value in sample_input.items(): - sample_input[key] = cpu_device.tensor_to_device(value).numpy() + sample_input[key] = cpu_device.tensor_to_device(value).numpy() # type: ignore loaded_model_out = ort_session.run(None, sample_input) From 6664382d9a2f776ab887f139000a5491d5ec5785 Mon Sep 17 00:00:00 2001 From: Eitan Turok <150733043+eitanturok@users.noreply.github.com> Date: Mon, 12 
Aug 2024 18:29:41 -0400 Subject: [PATCH 12/12] Use python 3.11 in GAs (#3529) --- .github/workflows/daily.yaml | 2 +- .github/workflows/pr-gpu.yaml | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml index ee94e89c2b..5552d6c19c 100644 --- a/.github/workflows/daily.yaml +++ b/.github/workflows/daily.yaml @@ -139,7 +139,7 @@ jobs: pip_deps: "[all]" pytest-command: ${{ matrix.pytest_command }} pytest-markers: ${{ matrix.markers }} - python-version: 3.9 + python-version: 3.11 gpu_num: ${{ matrix.gpu_num }} gha-timeout: 5400 secrets: diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml index 392a2665c8..2f335a5a68 100644 --- a/.github/workflows/pr-gpu.yaml +++ b/.github/workflows/pr-gpu.yaml @@ -29,7 +29,7 @@ jobs: pip_deps: "[all]" pytest-command: ${{ matrix.pytest_command }} pytest-markers: ${{ matrix.markers }} - python-version: 3.9 + python-version: 3.11 gpu_num: 1 secrets: mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }} @@ -55,7 +55,7 @@ jobs: pip_deps: "[all]" pytest-command: ${{ matrix.pytest_command }} pytest-markers: ${{ matrix.markers }} - python-version: 3.9 + python-version: 3.11 gpu_num: 2 secrets: mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }} @@ -82,7 +82,7 @@ jobs: pip_deps: "[all]" pytest-command: ${{ matrix.pytest_command }} pytest-markers: ${{ matrix.markers }} - python-version: 3.9 + python-version: 3.11 gpu_num: 4 secrets: mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }}