From b55855a3d4057d1d70a299d5b0b3630ea79e63e4 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 14 Nov 2023 15:44:30 +0100 Subject: [PATCH 01/24] fix initial typos (#2150) --- docs/source/concept_guides/big_model_inference.md | 2 +- docs/source/concept_guides/gradient_synchronization.md | 4 ++-- docs/source/usage_guides/distributed_inference.md | 2 +- docs/source/usage_guides/explore.md | 2 +- docs/source/usage_guides/megatron_lm.md | 8 ++++---- docs/source/usage_guides/training_zoo.md | 2 +- src/accelerate/commands/config/cluster.py | 2 +- 7 files changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/source/concept_guides/big_model_inference.md b/docs/source/concept_guides/big_model_inference.md index ddce9114cdc..4e09adae686 100644 --- a/docs/source/concept_guides/big_model_inference.md +++ b/docs/source/concept_guides/big_model_inference.md @@ -154,7 +154,7 @@ By passing `device_map="auto"`, we tell 🤗 Accelerate to determine automatical #### `no_split_module_classes` This parameter will indicate that some of the modules with the name `"Block"` should not be split across different devices. You should set here all blocks that -include a residutal connection of some kind. +include a residual connection of some kind. #### The `device_map` diff --git a/docs/source/concept_guides/gradient_synchronization.md b/docs/source/concept_guides/gradient_synchronization.md index 9010628ef7f..7ae8ab6853f 100644 --- a/docs/source/concept_guides/gradient_synchronization.md +++ b/docs/source/concept_guides/gradient_synchronization.md @@ -55,8 +55,8 @@ their gradients computed, collated, and updated before moving on to the next batch of data. When performing gradient accumulation, you accumulate `n` loss gradients and skip `optimizer.step()` until `n` batches have been reached. As all training -processes only need to sychronize by the time `optimizer.step()` is called, -without any modification to your training step, this neededless inter-process +processes only need to synchronize by the time `optimizer.step()` is called, +without any modification to your training step, this needless inter-process communication can cause a significant slowdown. How can you avoid this overhead? diff --git a/docs/source/usage_guides/distributed_inference.md b/docs/source/usage_guides/distributed_inference.md index 3bdd7121401..41053658482 100644 --- a/docs/source/usage_guides/distributed_inference.md +++ b/docs/source/usage_guides/distributed_inference.md @@ -51,7 +51,7 @@ def run_inference(rank, world_size): One will notice how we have to check the rank to know what prompt to send, which can be a bit tedious. A user might then also think that with 🤗 Accelerate, using the `Accelerator` to prepare a dataloader for such a task might also be -a simple way to manage this. (To learn more, check out the relvent section in the [Quick Tour](../quicktour#distributed-evaluation)) +a simple way to manage this. (To learn more, check out the relevant section in the [Quick Tour](../quicktour#distributed-evaluation)) Can it manage it? Yes. Does it add unneeded extra code however: also yes. diff --git a/docs/source/usage_guides/explore.md b/docs/source/usage_guides/explore.md index 2b4decefa2a..533c4cf444f 100644 --- a/docs/source/usage_guides/explore.md +++ b/docs/source/usage_guides/explore.md @@ -16,7 +16,7 @@ rendered properly in your Markdown viewer. # Learning how to incorporate 🤗 Accelerate features quickly! 
Please use the interactive tool below to help you get started with learning about a particular -feature of 🤗 Accelerate and how to utilize it! It will provide you with a code diff, an explaination +feature of 🤗 Accelerate and how to utilize it! It will provide you with a code diff, an explanation towards what is going on, as well as provide you with some useful links to explore more within the documentation! diff --git a/docs/source/usage_guides/megatron_lm.md b/docs/source/usage_guides/megatron_lm.md index 7b6822086da..25bea1f58d2 100644 --- a/docs/source/usage_guides/megatron_lm.md +++ b/docs/source/usage_guides/megatron_lm.md @@ -128,7 +128,7 @@ Do you want to enable Sequence Parallelism? [YES/no]: What is the Pipeline Parallelism degree/size? [1]:2 What is the number of micro-batches? [1]:2 Do you want to enable selective activation recomputation? [YES/no]: -Do you want to use distributed optimizer which shards optimizer state and gradients across data pralellel ranks? [YES/no]: +Do you want to use distributed optimizer which shards optimizer state and gradients across data parallel ranks? [YES/no]: What is the gradient clipping value based on global L2 Norm (0 to disable)? [1.0]: How many GPU(s) should be used for distributed training? [1]:4 Do you wish to use FP16 or BF16 (mixed precision)? [NO/fp16/bf16]: bf16 @@ -355,8 +355,8 @@ def main(): 2. For using the Megatron-LM datasets, a few more changes are required. Dataloaders for these datasets are available only on rank 0 of each tensor parallel group. As such, there are rank where dataloader won't be -avaiable and this requires tweaks to the training loop. Being able to do all this shows how -felixble and extensible 🤗 Accelerate is. The changes required are as follows. +available and this requires tweaks to the training loop. Being able to do all this shows how +flexible and extensible 🤗 Accelerate is. The changes required are as follows. a. For Megatron-LM indexed datasets, we need to use `MegatronLMDummyDataLoader` and pass the required dataset args to it such as `data_path`, `seq_length` etc. @@ -547,7 +547,7 @@ The `model(**batch_data)` call return loss(es) averaged across the data parallel This is fine for most cases wherein pre-training jobs are run using Megatron-LM features and you can easily compute the `perplexity` using the loss. For GPT model, returning logits in addition to loss(es) is supported. -These logits aren't gathered across data prallel ranks. Use `accelerator.utils.gather_across_data_parallel_groups` +These logits aren't gathered across data parallel ranks. Use `accelerator.utils.gather_across_data_parallel_groups` to gather logits across data parallel ranks. These logits along with labels can be used for computing various performance metrics. diff --git a/docs/source/usage_guides/training_zoo.md b/docs/source/usage_guides/training_zoo.md index 42dfe18a9f3..2a7f51d2873 100644 --- a/docs/source/usage_guides/training_zoo.md +++ b/docs/source/usage_guides/training_zoo.md @@ -15,7 +15,7 @@ rendered properly in your Markdown viewer. 
# Example Zoo -Below contains a non-exhuastive list of tutorials and scripts showcasing 🤗 Accelerate +Below contains a non-exhaustive list of tutorials and scripts showcasing 🤗 Accelerate ## Official Accelerate Examples: diff --git a/src/accelerate/commands/config/cluster.py b/src/accelerate/commands/config/cluster.py index 1090d17ddc3..1331e7fe43c 100644 --- a/src/accelerate/commands/config/cluster.py +++ b/src/accelerate/commands/config/cluster.py @@ -451,7 +451,7 @@ def get_cluster_input(): megatron_lm_config[prefix + "use_distributed_optimizer"] = _ask_field( "Do you want to use distributed optimizer " - "which shards optimizer state and gradients across data pralellel ranks? [YES/no]: ", + "which shards optimizer state and gradients across data parallel ranks? [YES/no]: ", _convert_yes_no_to_bool, default=True, error_message="Please enter yes or no.", From 8dedb140ef8995b4ff6f4b0e2452369a0ab1a969 Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Tue, 14 Nov 2023 11:53:57 -0500 Subject: [PATCH 02/24] Add note about GradientState being in-sync with the dataloader by default (#2134) * NOte about sync * PR review comments --- .../usage_guides/gradient_accumulation.md | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/docs/source/usage_guides/gradient_accumulation.md b/docs/source/usage_guides/gradient_accumulation.md index 54863015d8b..7960e6b0e4c 100644 --- a/docs/source/usage_guides/gradient_accumulation.md +++ b/docs/source/usage_guides/gradient_accumulation.md @@ -118,8 +118,24 @@ You can remove all the special checks for the step number and the loss adjustmen As you can see the [`Accelerator`] is able to keep track of the batch number you are on and it will automatically know whether to step through the prepared optimizer and how to adjust the loss. + Typically with gradient accumulation, you would need to adjust the number of steps to reflect the change in total batches you are -training on. 🤗 Accelerate automagically does this for you by default. Behind the scenes we instantiate a GradientAccumulationPlugin configured to do this. +training on. 🤗 Accelerate automagically does this for you by default. Behind the scenes we instantiate a [`GradientAccumulationPlugin`] configured to do this. + + + + + +The [`state.GradientState`] is sync'd with the active dataloader being iterated upon. As such it assumes naively that when we have reached the end of the dataloader everything will sync and a step will be performed. 
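In practice this means the final, possibly incomplete accumulation window still triggers an optimizer step. A minimal sketch of that default behavior is below — the batch counts are illustrative, and `model`, `optimizer`, `dataloader`, and `loss_function` are assumed to be defined as earlier in this guide:

```python
from accelerate import Accelerator

# With e.g. 10 batches and gradient_accumulation_steps=4, steps happen after
# batches 4 and 8, and a final step also happens after batch 10 because the
# GradientState sees the end of the dataloader and forces a sync by default.
accelerator = Accelerator(gradient_accumulation_steps=4)
model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)

for batch in dataloader:
    with accelerator.accumulate(model):
        inputs, targets = batch
        loss = loss_function(model(inputs), targets)
        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()
```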
To disable this, set `sync_with_dataloader` to be `False` in the [`GradientAccumulationPlugin`]: + +```{python} +from accelerate import Accelerator +from accelerate.utils import GradientAccumulationPlugin + +plugin = GradientAccumulationPlugin(sync_with_dataloader=False) +accelerator = Accelerator(..., gradient_accumulation_plugin=plugin) +``` + ## The finished code From e9fd72a61365d8a5c41e3b7260ea4ec2d0a23053 Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Tue, 14 Nov 2023 14:42:01 -0500 Subject: [PATCH 03/24] Deprecated stuff (#2152) --- .github/workflows/build-docker-images-release.yml | 2 +- .github/workflows/quality.yml | 2 +- .github/workflows/stale.yml | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build-docker-images-release.yml b/.github/workflows/build-docker-images-release.yml index 2d3a8a6fbfb..8d5f2194b78 100644 --- a/.github/workflows/build-docker-images-release.yml +++ b/.github/workflows/build-docker-images-release.yml @@ -15,7 +15,7 @@ jobs: outputs: version: ${{ steps.step1.outputs.version }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v3.1.0 - id: step1 run: echo "version=$(python setup.py --version)" >> $GITHUB_OUTPUT diff --git a/.github/workflows/quality.yml b/.github/workflows/quality.yml index 0060c7635f7..ac1a463b0bd 100644 --- a/.github/workflows/quality.yml +++ b/.github/workflows/quality.yml @@ -6,7 +6,7 @@ jobs: quality: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3.1.0 - name: Set up Python 3.8 uses: actions/setup-python@v3 with: diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index f79ecf7a3bd..39ca1384783 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -13,10 +13,10 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3.1.0 - name: Setup Python - uses: actions/setup-python@v1 + uses: actions/setup-python@v3 with: python-version: 3.8 From a912b2ee095910c5df58dd426ac612ced1e5b173 Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Tue, 14 Nov 2023 15:03:41 -0500 Subject: [PATCH 04/24] Add examples to tests (#2131) * Add examples to tests * Try now * Right name * Right path * Fin * Too slow, just test on runner --- .github/workflows/self_hosted_integration_tests.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/self_hosted_integration_tests.yml b/.github/workflows/self_hosted_integration_tests.yml index 7b4d8f6b813..dc8b49cd38f 100644 --- a/.github/workflows/self_hosted_integration_tests.yml +++ b/.github/workflows/self_hosted_integration_tests.yml @@ -81,6 +81,16 @@ jobs: source activate accelerate; pytest -sv tests/deepspeed + - name: Run transformers examples tests + working-directory: transformers/examples/pytorch + env: + CUDA_VISIBLE_DEVICES: ${{ matrix.cuda_visible_devices }} + WANDB_DISABLED: true + run: | + pip install -r _tests_requirements.txt + cd ../../ + pytest -sv examples/pytorch/test_accelerate_examples.py examples/pytorch/test_pytorch_examples.py + run-skorch-tests: container: image: huggingface/accelerate-gpu:latest From 0f2686c8d3e6d949c4b7efa15d7f2dee44f7ce91 Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Wed, 15 Nov 2023 11:29:39 -0500 Subject: [PATCH 05/24] Disable pypi for merge workflows + fix trainer tests (#2153) * Disable workflows for PR + merge * skorch * Fix transformers tests too --- .github/workflows/integration_tests.yml | 8 ------- 
.../self_hosted_integration_tests.yml | 24 ++++--------------- 2 files changed, 5 insertions(+), 27 deletions(-) diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index e7bff0d60ca..68085ff9b21 100644 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -25,11 +25,6 @@ jobs: runs-on: ubuntu-latest strategy: fail-fast: false - matrix: - transformers-version: [ - pypi, - github - ] steps: - uses: actions/checkout@v3.1.0 - name: Set up python 3.8 @@ -47,9 +42,6 @@ jobs: cd .. git clone https://github.com/huggingface/transformers cd transformers - if [[ ${{ matrix.transformers-version }} = pypi ]]; then - git checkout $(git describe --tags `git rev-list --tags --max-count=1`) - fi pip install .[torch,testing] - name: Show installed libraries diff --git a/.github/workflows/self_hosted_integration_tests.yml b/.github/workflows/self_hosted_integration_tests.yml index dc8b49cd38f..94e50e61ff3 100644 --- a/.github/workflows/self_hosted_integration_tests.yml +++ b/.github/workflows/self_hosted_integration_tests.yml @@ -29,10 +29,6 @@ jobs: strategy: fail-fast: false matrix: - transformers-version: [ - pypi, - github - ] cuda_visible_devices: [ "0", "0,1" @@ -51,11 +47,9 @@ jobs: run: | source activate accelerate git config --global --add safe.directory '*' - git checkout main && git pull && git fetch --tags - if [[ ${{ matrix.transformers-version }} = pypi ]]; then - git checkout $(git tag --sort=taggerdate | tail -1) - fi + git checkout main && git pull pip install .[torch,deepspeed-testing] + pip uninstall comet_ml wandb -y - name: Show installed libraries run: | @@ -82,13 +76,13 @@ jobs: pytest -sv tests/deepspeed - name: Run transformers examples tests - working-directory: transformers/examples/pytorch + working-directory: transformers/ env: CUDA_VISIBLE_DEVICES: ${{ matrix.cuda_visible_devices }} WANDB_DISABLED: true run: | - pip install -r _tests_requirements.txt - cd ../../ + source activate accelerate + pip install -r examples/pytorch/_tests_requirements.txt pytest -sv examples/pytorch/test_accelerate_examples.py examples/pytorch/test_pytorch_examples.py run-skorch-tests: @@ -98,11 +92,6 @@ jobs: runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] strategy: fail-fast: false - matrix: - skorch-version: [ - pypi, - github - ] steps: - name: Update accelerate clone and pip install working-directory: accelerate/ @@ -118,9 +107,6 @@ jobs: source activate accelerate git config --global --add safe.directory '*' git checkout master && git pull - if [[ ${{ matrix.skorch-version }} = pypi ]]; then - git checkout $(git describe --tags `git rev-list --tags --max-count=1`) - fi pip install .[testing] pip install flaky From 99877f56d6d77f38f031d0bfac40c0d2a409f5b8 Mon Sep 17 00:00:00 2001 From: Dave Berenbaum Date: Fri, 17 Nov 2023 08:49:13 -0500 Subject: [PATCH 06/24] Adds dvclive tracker (#2139) * dvclive tracker * add dvclive to test_trackers * fix dvclive tests * add dvclive example and respond to other feedback * fix dvclive tests * fix quality --- docs/source/usage_guides/tracking.md | 3 +- .../deepspeed_with_config_support.py | 2 +- .../by_feature/megatron_lm_gpt_pretraining.py | 2 +- setup.py | 2 +- src/accelerate/test_utils/testing.py | 8 ++ src/accelerate/tracking.py | 79 +++++++++++++++++++ src/accelerate/utils/__init__.py | 1 + src/accelerate/utils/dataclasses.py | 2 + src/accelerate/utils/imports.py | 4 + tests/test_examples.py | 2 +- tests/test_tracking.py | 51 +++++++++++- 11 files changed, 150 
insertions(+), 6 deletions(-) diff --git a/docs/source/usage_guides/tracking.md b/docs/source/usage_guides/tracking.md index 141fea6924b..dba4b084d5d 100644 --- a/docs/source/usage_guides/tracking.md +++ b/docs/source/usage_guides/tracking.md @@ -20,7 +20,7 @@ There are a large number of experiment tracking API's available, however getting ## Integrated Trackers -Currently `Accelerate` supports six trackers out-of-the-box: +Currently `Accelerate` supports seven trackers out-of-the-box: - TensorBoard - WandB @@ -28,6 +28,7 @@ Currently `Accelerate` supports six trackers out-of-the-box: - Aim - MLFlow - ClearML +- DVCLive To use any of them, pass in the selected type(s) to the `log_with` parameter in [`Accelerate`]: ```python diff --git a/examples/by_feature/deepspeed_with_config_support.py b/examples/by_feature/deepspeed_with_config_support.py index 15e810c4a2e..b5f122f3ad1 100755 --- a/examples/by_feature/deepspeed_with_config_support.py +++ b/examples/by_feature/deepspeed_with_config_support.py @@ -220,7 +220,7 @@ def parse_args(): default="all", help=( 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,' - ' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.' + ' `"wandb"`, `"comet_ml"`, and `"dvclive"`. Use `"all"` (default) to report to all integrations.' "Only applicable when `--with_tracking` is passed." ), ) diff --git a/examples/by_feature/megatron_lm_gpt_pretraining.py b/examples/by_feature/megatron_lm_gpt_pretraining.py index 3c048b2600e..b0e1b33700f 100644 --- a/examples/by_feature/megatron_lm_gpt_pretraining.py +++ b/examples/by_feature/megatron_lm_gpt_pretraining.py @@ -216,7 +216,7 @@ def parse_args(): default="all", help=( 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,' - ' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.' + ' `"wandb"`, `"comet_ml"`, and `"dvclive"`. Use `"all"` (default) to report to all integrations.' "Only applicable when `--with_tracking` is passed." ), ) diff --git a/setup.py b/setup.py index f6eefda0dea..b3a8fda47bf 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ extras["testing"] = extras["test_prod"] + extras["test_dev"] extras["rich"] = ["rich"] -extras["test_trackers"] = ["wandb", "comet-ml", "tensorboard"] +extras["test_trackers"] = ["wandb", "comet-ml", "tensorboard", "dvclive"] extras["dev"] = extras["quality"] + extras["testing"] + extras["rich"] extras["sagemaker"] = [ diff --git a/src/accelerate/test_utils/testing.py b/src/accelerate/test_utils/testing.py index d6d1e2f2f0a..8a8b82f4e34 100644 --- a/src/accelerate/test_utils/testing.py +++ b/src/accelerate/test_utils/testing.py @@ -35,6 +35,7 @@ is_comet_ml_available, is_datasets_available, is_deepspeed_available, + is_dvclive_available, is_mps_available, is_pandas_available, is_tensorboard_available, @@ -231,6 +232,13 @@ def require_clearml(test_case): return unittest.skipUnless(is_clearml_available(), "test requires clearml")(test_case) +def require_dvclive(test_case): + """ + Decorator marking a test that requires dvclive installed. These tests are skipped when dvclive isn't installed + """ + return unittest.skipUnless(is_dvclive_available(), "test requires dvclive")(test_case) + + def require_pandas(test_case): """ Decorator marking a test that requires pandas installed. 
These tests are skipped when pandas isn't installed diff --git a/src/accelerate/tracking.py b/src/accelerate/tracking.py index 4f536d57812..711f616b73e 100644 --- a/src/accelerate/tracking.py +++ b/src/accelerate/tracking.py @@ -30,6 +30,7 @@ is_aim_available, is_clearml_available, is_comet_ml_available, + is_dvclive_available, is_mlflow_available, is_tensorboard_available, is_wandb_available, @@ -57,6 +58,9 @@ if is_clearml_available(): _available_trackers.append(LoggerType.CLEARML) +if is_dvclive_available(): + _available_trackers.append(LoggerType.DVCLIVE) + logger = get_logger(__name__) @@ -837,6 +841,79 @@ def _get_title_series(name): return name, "train" +class DVCLiveTracker(GeneralTracker): + """ + A `Tracker` class that supports `dvclive`. Should be initialized at the start of your script. + + Args: + run_name (`str`, *optional*): + Ignored for dvclive. See `kwargs` instead. + kwargs: + Additional key word arguments passed along to [`dvclive.Live()`](https://dvc.org/doc/dvclive/live). + + Example: + + ```py + from accelerate import Accelerator + + accelerator = Accelerator(log_with="dvclive") + accelerator.init_trackers(project_name="my_project", init_kwargs={"dvclive": {"dir": "my_directory"}}) + ``` + """ + + name = "dvclive" + requires_logging_directory = False + + @on_main_process + def __init__(self, run_name: Optional[str] = None, live: Optional[Any] = None, **kwargs): + from dvclive import Live + + super().__init__() + self.live = live if live is not None else Live(**kwargs) + + @property + def tracker(self): + return self.live + + @on_main_process + def store_init_configuration(self, values: dict): + """ + Logs `values` as hyperparameters for the run. Should be run at the beginning of your experiment. Stores the + hyperparameters in a yaml file for future use. + + Args: + values (Dictionary `str` to `bool`, `str`, `float`, `int`, or a List or Dict of those types): + Values to be stored as initial hyperparameters as key-value pairs. The values need to have type `bool`, + `str`, `float`, or `int`. + """ + self.live.log_params(values) + + @on_main_process + def log(self, values: dict, step: Optional[int] = None, **kwargs): + """ + Logs `values` to the current run. + + Args: + values (Dictionary `str` to `str`, `float`, or `int`): + Values to be logged as key-value pairs. The values need to have type `str`, `float`, or `int`. + step (`int`, *optional*): + The run step. If included, the log will be affiliated with this step. + kwargs: + Additional key word arguments passed along to `dvclive.Live.log_metric()`. + """ + if step is not None: + self.live.step = step + for k, v in values.items(): + self.live.log_metric(k, v, **kwargs) + + @on_main_process + def finish(self): + """ + Closes `dvclive.Live()`. + """ + self.live.end() + + LOGGER_TYPE_TO_CLASS = { "aim": AimTracker, "comet_ml": CometMLTracker, @@ -844,6 +921,7 @@ def _get_title_series(name): "tensorboard": TensorBoardTracker, "wandb": WandBTracker, "clearml": ClearMLTracker, + "dvclive": DVCLiveTracker, } @@ -866,6 +944,7 @@ def filter_trackers( - `"wandb"` - `"comet_ml"` - `"mlflow"` + - `"dvclive"` If `"all"` is selected, will pick up all available trackers in the environment and initialize them. Can also accept implementations of `GeneralTracker` for custom trackers, and can be combined with `"all"`. 
logging_dir (`str`, `os.PathLike`, *optional*): diff --git a/src/accelerate/utils/__init__.py b/src/accelerate/utils/__init__.py index 88cc927f001..fa15b173ed3 100644 --- a/src/accelerate/utils/__init__.py +++ b/src/accelerate/utils/__init__.py @@ -52,6 +52,7 @@ is_cuda_available, is_datasets_available, is_deepspeed_available, + is_dvclive_available, is_fp8_available, is_ipex_available, is_megatron_lm_available, diff --git a/src/accelerate/utils/dataclasses.py b/src/accelerate/utils/dataclasses.py index 72f3c9aeb2d..e0e41568b0c 100644 --- a/src/accelerate/utils/dataclasses.py +++ b/src/accelerate/utils/dataclasses.py @@ -340,6 +340,7 @@ class LoggerType(BaseEnum): - **TENSORBOARD** -- TensorBoard as an experiment tracker - **WANDB** -- wandb as an experiment tracker - **COMETML** -- comet_ml as an experiment tracker + - **DVCLIVE** -- dvclive as an experiment tracker """ ALL = "all" @@ -349,6 +350,7 @@ class LoggerType(BaseEnum): COMETML = "comet_ml" MLFLOW = "mlflow" CLEARML = "clearml" + DVCLIVE = "dvclive" class PrecisionType(BaseEnum): diff --git a/src/accelerate/utils/imports.py b/src/accelerate/utils/imports.py index 9a60233c96c..27389eab107 100644 --- a/src/accelerate/utils/imports.py +++ b/src/accelerate/utils/imports.py @@ -297,3 +297,7 @@ def is_xpu_available(check_device=False): except RuntimeError: return False return hasattr(torch, "xpu") and torch.xpu.is_available() + + +def is_dvclive_available(): + return _is_package_available("dvclive") diff --git a/tests/test_examples.py b/tests/test_examples.py index 0426fc645b6..4b697e12132 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -205,7 +205,7 @@ def test_multi_process_metrics(self): run_command(self._launch_args + testargs) @require_trackers - @mock.patch.dict(os.environ, {"WANDB_MODE": "offline"}) + @mock.patch.dict(os.environ, {"WANDB_MODE": "offline", "DVCLIVE_TEST": "true"}) def test_tracking(self): with tempfile.TemporaryDirectory() as tmpdir: testargs = f""" diff --git a/tests/test_tracking.py b/tests/test_tracking.py index 545b51fefd4..58709546ea5 100644 --- a/tests/test_tracking.py +++ b/tests/test_tracking.py @@ -35,13 +35,19 @@ TempDirTestCase, require_clearml, require_comet_ml, + require_dvclive, require_pandas, require_tensorboard, require_wandb, skip, ) from accelerate.tracking import CometMLTracker, GeneralTracker -from accelerate.utils import ProjectConfiguration, is_comet_ml_available, is_tensorboard_available +from accelerate.utils import ( + ProjectConfiguration, + is_comet_ml_available, + is_dvclive_available, + is_tensorboard_available, +) if is_comet_ml_available(): @@ -52,6 +58,11 @@ import tensorboard.compat.proto.event_pb2 as event_pb2 +if is_dvclive_available(): + from dvclive.plots.metric import Metric + from dvclive.serialize import load_yaml + from dvclive.utils import parse_metrics + logger = logging.getLogger(__name__) @@ -473,3 +484,41 @@ def test_log(self): "some_string": "", } self.assertDictEqual(data, truth) + + +@require_dvclive +@mock.patch("dvclive.live.get_dvc_repo", return_value=None) +class DVCLiveTrackingTest(unittest.TestCase): + def test_init_trackers(self, mock_repo): + project_name = "test_project_with_config" + with tempfile.TemporaryDirectory() as dirpath: + accelerator = Accelerator(log_with="dvclive") + config = { + "num_iterations": 12, + "learning_rate": 1e-2, + "some_boolean": False, + "some_string": "some_value", + } + init_kwargs = {"dvclive": {"dir": dirpath, "save_dvc_exp": False, "dvcyaml": None}} + accelerator.init_trackers(project_name, config, 
init_kwargs) + accelerator.end_training() + live = accelerator.trackers[0].live + params = load_yaml(live.params_file) + assert params == config + + def test_log(self, mock_repo): + project_name = "test_project_with_log" + with tempfile.TemporaryDirectory() as dirpath: + accelerator = Accelerator(log_with="dvclive", project_dir=dirpath) + init_kwargs = {"dvclive": {"dir": dirpath, "save_dvc_exp": False, "dvcyaml": None}} + accelerator.init_trackers(project_name, init_kwargs=init_kwargs) + values = {"total_loss": 0.1, "iteration": 1, "my_text": "some_value"} + accelerator.log(values, step=0) + accelerator.end_training() + live = accelerator.trackers[0].live + logs, latest = parse_metrics(live) + assert latest == values + scalars = os.path.join(live.plots_dir, Metric.subfolder) + assert os.path.join(scalars, "total_loss.tsv") in logs + assert os.path.join(scalars, "iteration.tsv") in logs + assert os.path.join(scalars, "my_text.tsv") in logs From cf745c936d541c538f2d1dfda12b2d5c0a2715f8 Mon Sep 17 00:00:00 2001 From: Jingru Date: Fri, 17 Nov 2023 22:00:55 +0800 Subject: [PATCH 07/24] check port availability only in main deepspeed/torchrun launcher (#2078) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * check port availability only in main deepspeed launcher * check port availability only in main launcher for deepspeed/torchrun * Update launch.py add comments --------- Co-authored-by: 聂靖入 --- src/accelerate/utils/launch.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/accelerate/utils/launch.py b/src/accelerate/utils/launch.py index 133d55364ad..a299343d90b 100644 --- a/src/accelerate/utils/launch.py +++ b/src/accelerate/utils/launch.py @@ -128,7 +128,10 @@ def prepare_multi_gpu_env(args: argparse.Namespace) -> Dict[str, str]: if main_process_port is None: main_process_port = 29500 - if is_port_in_use(main_process_port): + # only need to check port availability in main process, in case we have to start multiple launchers on the same machine + # for some reasons like splitting log files. + need_port_check = num_machines <= 1 or int(args.machine_rank) == 0 + if need_port_check and is_port_in_use(main_process_port): raise ConnectionError( f"Tried to launch distributed communication on port `{main_process_port}`, but another process is utilizing it. " "Please specify a different port (such as using the `----main_process_port` flag or specifying a different `main_process_port` in your config file)" @@ -272,7 +275,10 @@ def prepare_deepspeed_cmd_env(args: argparse.Namespace) -> Tuple[List[str], Dict if main_process_port is None: main_process_port = 29500 - if is_port_in_use(main_process_port): + # only need to check port availability in main process, in case we have to start multiple launchers on the same machine + # for some reasons like splitting log files. + need_port_check = num_machines <= 1 or int(args.machine_rank) == 0 + if need_port_check and is_port_in_use(main_process_port): raise ConnectionError( f"Tried to launch distributed communication on port `{main_process_port}`, but another process is utilizing it. 
" "Please specify a different port (such as using the `----main_process_port` flag or specifying a different `main_process_port` in your config file)" From a5a7c039a05424a5715f7f829210245fe3165812 Mon Sep 17 00:00:00 2001 From: Frankie Robertson Date: Fri, 17 Nov 2023 16:01:35 +0200 Subject: [PATCH 08/24] Do not attempt to pad nested tensors (#2041) --- src/accelerate/utils/__init__.py | 1 + src/accelerate/utils/operations.py | 11 +++++++++++ tests/test_utils.py | 11 +++++++++++ 3 files changed, 23 insertions(+) diff --git a/src/accelerate/utils/__init__.py b/src/accelerate/utils/__init__.py index fa15b173ed3..497c14abe28 100644 --- a/src/accelerate/utils/__init__.py +++ b/src/accelerate/utils/__init__.py @@ -102,6 +102,7 @@ save_offload_index, ) from .operations import ( + CannotPadNestedTensorWarning, broadcast, broadcast_object_list, concatenate, diff --git a/src/accelerate/utils/operations.py b/src/accelerate/utils/operations.py index 267f7809ef0..5d1df1d995c 100644 --- a/src/accelerate/utils/operations.py +++ b/src/accelerate/utils/operations.py @@ -17,6 +17,7 @@ """ import pickle +import warnings from functools import update_wrapper, wraps from typing import Any, Mapping @@ -525,6 +526,10 @@ def concatenate(data, dim=0): return torch.cat(data, dim=dim) +class CannotPadNestedTensorWarning(UserWarning): + pass + + @chained_operation def pad_across_processes(tensor, dim=0, pad_index=0, pad_first=False): """ @@ -543,6 +548,12 @@ def pad_across_processes(tensor, dim=0, pad_index=0, pad_first=False): """ def _pad_across_processes(tensor, dim=0, pad_index=0, pad_first=False): + if getattr(tensor, "is_nested", False): + warnings.warn( + "Cannot pad nested tensors without more information. Leaving unprocessed.", + CannotPadNestedTensorWarning, + ) + return tensor if dim >= len(tensor.shape): return tensor diff --git a/tests/test_utils.py b/tests/test_utils.py index fa23e72986d..239214bfc3c 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -27,11 +27,13 @@ from accelerate.test_utils.testing import require_cuda, require_torch_min_version from accelerate.test_utils.training import RegressionModel from accelerate.utils import ( + CannotPadNestedTensorWarning, check_os_kernel, convert_outputs_to_fp32, extract_model_from_parallel, find_device, listify, + pad_across_processes, patch_environment, recursively_apply, save, @@ -226,3 +228,12 @@ def forward(self, x): save(model.state_dict(), save_path, safe_serialization=True) self.assertEqual(len(log.records), 1) self.assertIn("Removed shared tensor", log.output[0]) + + @require_torch_min_version(version="1.12") + def test_pad_across_processes(self): + from torch.nested import nested_tensor + + nt = nested_tensor([[1, 2, 3], [1], [1, 2]]) + with self.assertWarns(CannotPadNestedTensorWarning): + nt2 = pad_across_processes(nt) + self.assertIs(nt, nt2) From cd515812483aedd070933c3367366a0b5ef43daa Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Fri, 17 Nov 2023 09:24:20 -0500 Subject: [PATCH 09/24] Add warning for problematic libraries (#2151) * Test bnb and fix nb launcher skip * Fin * Rm comment * PR Review comments * Just star --- src/accelerate/launchers.py | 13 +++++++- .../test_utils/scripts/test_notebook.py | 33 ++++++++++++++----- src/accelerate/test_utils/testing.py | 6 ++-- src/accelerate/utils/__init__.py | 8 ++++- src/accelerate/utils/environment.py | 9 +++++ tests/test_multigpu.py | 23 ++++++------- 6 files changed, 67 insertions(+), 25 deletions(-) diff --git a/src/accelerate/launchers.py b/src/accelerate/launchers.py index 
310e52c313d..0e32d84d06d 100644 --- a/src/accelerate/launchers.py +++ b/src/accelerate/launchers.py @@ -19,7 +19,7 @@ import torch from .state import AcceleratorState, PartialState -from .utils import PrecisionType, PrepareForLaunch, is_mps_available, patch_environment +from .utils import PrecisionType, PrepareForLaunch, are_libraries_initialized, is_mps_available, patch_environment def test_launch(): @@ -142,6 +142,17 @@ def train(*args): "inside your training function. Restart your notebook and make sure no cells initializes an " "`Accelerator`." ) + # Check for specific libraries known to initialize CUDA that users constantly use + problematic_imports = are_libraries_initialized("bitsandbytes") + if len(problematic_imports) > 1: + err = ( + "Could not start distributed process. Libraries known to initialize CUDA upon import have been " + "imported already. Please keep these imports inside your training function to try and help with this:" + ) + for lib_name in problematic_imports: + err += f"\n\t* `{lib_name}`" + raise RuntimeError(err) + # torch.distributed will expect a few environment variable to be here. We set the ones common to each # process here (the other ones will be set be the launcher). with patch_environment( diff --git a/src/accelerate/test_utils/scripts/test_notebook.py b/src/accelerate/test_utils/scripts/test_notebook.py index 8f215d8fd19..999fab34cbe 100644 --- a/src/accelerate/test_utils/scripts/test_notebook.py +++ b/src/accelerate/test_utils/scripts/test_notebook.py @@ -1,17 +1,34 @@ # Test file to ensure that in general certain situational setups for notebooks work. -import argparse +import os + +from pytest import raises from accelerate import PartialState, notebook_launcher +from accelerate.test_utils import require_bnb +from accelerate.utils import is_bnb_available + +def basic_function(): + # Just prints the PartialState + print(f"PartialState:\n{PartialState()}") -parser = argparse.ArgumentParser() -parser.add_argument("--num_processes", type=int, default=1) -args = parser.parse_args() +NUM_PROCESSES = os.environ.get("ACCELERATE_NUM_PROCESSES", 1) -def function(): - print(f"PartialState:\n{PartialState()}") + +def test_can_initialize(): + notebook_launcher(basic_function, (), num_processes=NUM_PROCESSES) + + +@require_bnb +def test_problematic_imports(): + with raises(AssertionError, match="Please keep these imports"): + notebook_launcher(basic_function, (), num_processes=NUM_PROCESSES) -if __name__ == "__main__": - notebook_launcher(function, num_processes=int(args.num_processes)) +def main(): + print("Test basic notebook can be ran") + test_can_initialize() + if is_bnb_available(): + print("Test problematic imports (bnb)") + test_problematic_imports() diff --git a/src/accelerate/test_utils/testing.py b/src/accelerate/test_utils/testing.py index 8a8b82f4e34..75725012181 100644 --- a/src/accelerate/test_utils/testing.py +++ b/src/accelerate/test_utils/testing.py @@ -431,13 +431,15 @@ class SubprocessCallException(Exception): pass -def run_command(command: List[str], return_stdout=False): +def run_command(command: List[str], return_stdout=False, env=None): """ Runs `command` with `subprocess.check_output` and will potentially return the `stdout`. 
Will also properly capture if an error occured while running `command` """ + if env is None: + env = os.environ.copy() try: - output = subprocess.check_output(command, stderr=subprocess.STDOUT) + output = subprocess.check_output(command, stderr=subprocess.STDOUT, env=env) if return_stdout: if hasattr(output, "decode"): output = output.decode("utf-8") diff --git a/src/accelerate/utils/__init__.py b/src/accelerate/utils/__init__.py index 497c14abe28..702d9697acd 100644 --- a/src/accelerate/utils/__init__.py +++ b/src/accelerate/utils/__init__.py @@ -37,7 +37,13 @@ TensorInformation, TorchDynamoPlugin, ) -from .environment import get_int_from_env, parse_choice_from_env, parse_flag_from_env, str_to_bool +from .environment import ( + are_libraries_initialized, + get_int_from_env, + parse_choice_from_env, + parse_flag_from_env, + str_to_bool, +) from .imports import ( get_ccl_version, is_4bit_bnb_available, diff --git a/src/accelerate/utils/environment.py b/src/accelerate/utils/environment.py index 0cd46c2dcaf..cff6e73f380 100644 --- a/src/accelerate/utils/environment.py +++ b/src/accelerate/utils/environment.py @@ -13,6 +13,8 @@ # limitations under the License. import os +import sys +from typing import Dict def str_to_bool(value) -> int: @@ -48,3 +50,10 @@ def parse_flag_from_env(key, default=False): def parse_choice_from_env(key, default="no"): value = os.environ.get(key, str(default)) return value + + +def are_libraries_initialized(*library_names: str) -> Dict[str, bool]: + """ + Checks if any of `library_names` are imported in the environment. Will return results as a `key:bool` pair. + """ + return [lib_name for lib_name in library_names if lib_name in sys.modules] diff --git a/tests/test_multigpu.py b/tests/test_multigpu.py index 73ee6367f5c..a479130b74f 100644 --- a/tests/test_multigpu.py +++ b/tests/test_multigpu.py @@ -21,7 +21,8 @@ import accelerate from accelerate import Accelerator from accelerate.big_modeling import dispatch_model -from accelerate.test_utils import assert_exception, execute_subprocess_async, require_multi_gpu, skip +from accelerate.test_utils import assert_exception, execute_subprocess_async, require_multi_gpu +from accelerate.test_utils.testing import run_command from accelerate.utils import patch_environment @@ -33,6 +34,9 @@ def setUp(self): mod_file.split(os.path.sep)[:-1] + ["scripts", "test_distributed_data_loop.py"] ) self.operation_file_path = os.path.sep.join(mod_file.split(os.path.sep)[:-1] + ["scripts", "test_ops.py"]) + self.notebook_launcher_path = os.path.sep.join( + mod_file.split(os.path.sep)[:-1] + ["scripts", "test_notebook.py"] + ) @require_multi_gpu def test_multi_gpu(self): @@ -66,23 +70,16 @@ def test_distributed_data_loop(self): with patch_environment(omp_num_threads=1, cuda_visible_devices="0,1"): execute_subprocess_async(cmd, env=os.environ.copy()) - # Need to see why this test raises forking issues when ran as a suite - @skip @require_multi_gpu def test_notebook_launcher(self): """ - This test checks that the `notebook_launcher` will be able to intialize - a `PartialState` without issue + This test checks a variety of situations and scenarios + with the `notebook_launcher` """ - cmd = [ - "python", - "-m", - "accelerate.test_utils.scripts.test_notebook", - "--num_processes", - str(torch.cuda.device_count()), - ] + cmd = ["torchrun", f"--nproc_per_node={torch.cuda.device_count()}", self.notebook_launcher_path] + print(f"Running {cmd}") with patch_environment(omp_num_threads=1): - execute_subprocess_async(cmd, env=os.environ.copy()) + 
run_command(cmd, env=os.environ.copy()) if __name__ == "__main__": From 62af7372198c8c3e0fbc43c7490ba1a4c015e2be Mon Sep 17 00:00:00 2001 From: Sumanth R Hegde <39546518+SumanthRH@users.noreply.github.com> Date: Mon, 20 Nov 2023 04:24:30 -0800 Subject: [PATCH 10/24] Add ZeRO++ to DeepSpeed usage docs (#2166) * added zeropp to deepspeed doc file * minor edit to clarify hpz size --- docs/source/usage_guides/deepspeed.md | 30 +++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/docs/source/usage_guides/deepspeed.md b/docs/source/usage_guides/deepspeed.md index 6f6350dcb2a..0404136ff3e 100644 --- a/docs/source/usage_guides/deepspeed.md +++ b/docs/source/usage_guides/deepspeed.md @@ -15,7 +15,7 @@ rendered properly in your Markdown viewer. # DeepSpeed -[DeepSpeed](https://github.com/microsoft/DeepSpeed) implements everything described in the [ZeRO paper](https://arxiv.org/abs/1910.02054). Currently, it provides full support for: +[DeepSpeed](https://github.com/microsoft/DeepSpeed) implements everything described in the [ZeRO paper](https://arxiv.org/abs/1910.02054). Some of the salient optimizations are: 1. Optimizer state partitioning (ZeRO stage 1) 2. Gradient partitioning (ZeRO stage 2) @@ -23,6 +23,7 @@ rendered properly in your Markdown viewer. 4. Custom mixed precision training handling 5. A range of fast CUDA-extension-based optimizers 6. ZeRO-Offload to CPU and Disk/NVMe +7. Heirarchical partitioning of model parameters (ZeRO++) ZeRO-Offload has its own dedicated paper: [ZeRO-Offload: Democratizing Billion-Scale Model Training](https://arxiv.org/abs/2101.06840). And NVMe-support is described in the paper [ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning](https://arxiv.org/abs/2104.07857). @@ -44,7 +45,7 @@ won't be possible on a single GPU. Training: -1. DeepSpeed ZeRO training supports the full ZeRO stages 1, 2 and 3 as well as CPU/Disk offload of optimizer states, gradients and parameters. +1. 🤗 Accelerate integrates all features of DeepSpeed ZeRO. This includes all the ZeRO stages 1, 2 and 3 as well as ZeRO-Offload, ZeRO-Infinity (which can offload to disk/NVMe) and ZeRO++. Below is a short description of Data Parallelism using ZeRO - Zero Redundancy Optimizer along with diagram from this [blog post](https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/) ![ZeRO Data Parallelism](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-zero.png) @@ -60,6 +61,8 @@ Below is a short description of Data Parallelism using ZeRO - Zero Redundancy Op e. **Param Offload**: Offloads the model parameters to CPU/Disk building on top of ZERO Stage 3 + f. **Heirarchical Paritioning**: Enables efficient multi-node training with data-parallel training across nodes and ZeRO-3 sharding within a node, built on top of ZeRO Stage 3. + Note: With respect to Disk Offload, the disk should be an NVME for decent speed but it technically works on any Disk Inference: @@ -349,6 +352,27 @@ accelerate launch examples/by_feature/deepspeed_with_config_support.py \ --report_to "wandb"\ ``` +**ZeRO++ Config Example** +You can use the the features of ZeRO++ by using the appropriate config parameters. Note that ZeRO++ is an extension for ZeRO Stage 3. 
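A config file containing these fields is consumed like any other DeepSpeed config, either by pointing `deepspeed_config_file` at it when answering `accelerate config`, or programmatically through a plugin. A minimal sketch of the programmatic route — the file name is a placeholder, and it assumes the config uses ZeRO Stage 3 as ZeRO++ requires:

```python
from accelerate import Accelerator
from accelerate.utils import DeepSpeedPlugin

# "zeropp_config.json" is a placeholder path to a DeepSpeed config that
# includes the ZeRO++ fields shown below (zero_quantized_weights,
# zero_hpz_partition_size, zero_quantized_gradients).
deepspeed_plugin = DeepSpeedPlugin(hf_ds_config="zeropp_config.json")
accelerator = Accelerator(deepspeed_plugin=deepspeed_plugin)
```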
Here is how the config file can be modified, from [DeepSpeed's ZeRO++ tutorial](https://www.deepspeed.ai/tutorials/zeropp/): + +```json +{ + "zero_optimization": { + "stage": 3, + "reduce_bucket_size": "auto", + + "zero_quantized_weights": true, + "zero_hpz_partition_size": 8, + "zero_quantized_gradients": true, + + "contiguous_gradients": true, + "overlap_comm": true + } +} +``` + +For heirarchical partitioning, the partition size `zero_hpz_partition_size` should ideally be set to the number of GPUs per node. (For example, the above config file assumes 8 GPUs per node) + **Important code changes when using DeepSpeed Config File** 1. DeepSpeed Optimizers and Schedulers. For more information on these, @@ -683,6 +707,8 @@ Papers: - [ZeRO: Memory Optimizations Toward Training Trillion Parameter Models](https://arxiv.org/abs/1910.02054) - [ZeRO-Offload: Democratizing Billion-Scale Model Training](https://arxiv.org/abs/2101.06840) - [ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning](https://arxiv.org/abs/2104.07857) +- [ZeRO++: Extremely Efficient Collective Communication for Giant Model Training](https://arxiv.org/abs/2306.10209) + Finally, please, remember that 🤗 `Accelerate` only integrates DeepSpeed, therefore if you have any problems or questions with regards to DeepSpeed usage, please, file an issue with [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/issues). From fbe00d7897c180a4ac67e5651ba263bb6d9400e8 Mon Sep 17 00:00:00 2001 From: Enming Yuan <2650576090@qq.com> Date: Mon, 20 Nov 2023 20:53:10 +0800 Subject: [PATCH 11/24] Update dataclasses.py (#2168) Bug fix: recompute_activation -> recompute_activations --- src/accelerate/utils/dataclasses.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/accelerate/utils/dataclasses.py b/src/accelerate/utils/dataclasses.py index e0e41568b0c..2d22d460aee 100644 --- a/src/accelerate/utils/dataclasses.py +++ b/src/accelerate/utils/dataclasses.py @@ -1043,7 +1043,7 @@ class MegatronLMPlugin: default=None, metadata={"help": "enable sequence parallelism"}, ) - recompute_activation: bool = field( + recompute_activations: bool = field( default=None, metadata={"help": "enable selective activation recomputation"}, ) @@ -1196,8 +1196,8 @@ def __post_init__(self): self.num_micro_batches = int(os.environ.get(prefix + "NUM_MICRO_BATCHES", 1)) if self.gradient_clipping is None: self.gradient_clipping = float(os.environ.get(prefix + "GRADIENT_CLIPPING", 1.0)) - if self.recompute_activation is None: - self.recompute_activation = str_to_bool(os.environ.get(prefix + "RECOMPUTE_ACTIVATION", "False")) == 1 + if self.recompute_activations is None: + self.recompute_activations = str_to_bool(os.environ.get(prefix + "RECOMPUTE_ACTIVATIONS", "False")) == 1 if self.use_distributed_optimizer is None: self.use_distributed_optimizer = ( str_to_bool(os.environ.get(prefix + "USE_DISTRIBUTED_OPTIMIZER", "False")) == 1 @@ -1234,7 +1234,7 @@ def __post_init__(self): "eval_iters": self.eval_iters, "eval_interval": self.eval_interval, } - if self.recompute_activation: + if self.recompute_activations: self.megatron_lm_default_args["recompute_granularity"] = "selective" if self.tensorboard_dir is not None: self.megatron_lm_default_args["tensorboard_dir"] = self.tensorboard_dir From 35b020635395e9834c645f06b39ae63e3d6799bf Mon Sep 17 00:00:00 2001 From: Marc Sun <57196510+SunMarc@users.noreply.github.com> Date: Mon, 20 Nov 2023 15:49:50 +0100 Subject: [PATCH 12/24] Fix non persistant buffer dispatch (#1941) * offload 
only persistant buffer * add tests and fix naming * remove_non_persistant=True by default * style * style again * fix hooks * fix logic --- src/accelerate/hooks.py | 18 +++++++++++---- src/accelerate/utils/modeling.py | 33 +++++++++++++++++++++++++-- tests/test_big_modeling.py | 39 ++++++++++++++++++++++++++++++++ tests/test_modeling_utils.py | 24 ++++++++++++++++++++ 4 files changed, 108 insertions(+), 6 deletions(-) diff --git a/src/accelerate/hooks.py b/src/accelerate/hooks.py index c033e890f2d..d87f1c18db3 100644 --- a/src/accelerate/hooks.py +++ b/src/accelerate/hooks.py @@ -26,6 +26,7 @@ send_to_device, set_module_tensor_to_device, ) +from .utils.modeling import get_non_persistent_buffers class ModelHook: @@ -262,14 +263,17 @@ def init_hook(self, module): module, include_buffers=self.offload_buffers, recurse=self.place_submodules ) } - for name, _ in named_module_tensors( - module, include_buffers=self.offload_buffers, recurse=self.place_submodules + module, include_buffers=self.offload_buffers, recurse=self.place_submodules, remove_non_persistent=True ): set_module_tensor_to_device(module, name, "meta") if not self.offload_buffers and self.execution_device is not None: for name, _ in module.named_buffers(recurse=self.place_submodules): set_module_tensor_to_device(module, name, self.execution_device) + elif self.offload_buffers and self.execution_device is not None: + for name in get_non_persistent_buffers(module, recurse=self.place_submodules): + set_module_tensor_to_device(module, name, self.execution_device) + return module def pre_forward(self, module, *args, **kwargs): @@ -277,7 +281,10 @@ def pre_forward(self, module, *args, **kwargs): self.input_device = find_device([args, kwargs]) if self.offload: for name, _ in named_module_tensors( - module, include_buffers=self.offload_buffers, recurse=self.place_submodules + module, + include_buffers=self.offload_buffers, + recurse=self.place_submodules, + remove_non_persistent=True, ): fp16_statistics = None if "weight" in name and name.replace("weight", "SCB") in self.weights_map.keys(): @@ -294,7 +301,10 @@ def pre_forward(self, module, *args, **kwargs): def post_forward(self, module, output): if self.offload: for name, _ in named_module_tensors( - module, include_buffers=self.offload_buffers, recurse=self.place_submodules + module, + include_buffers=self.offload_buffers, + recurse=self.place_submodules, + remove_non_persistent=True, ): set_module_tensor_to_device(module, name, "meta") if type(module).__name__ == "Linear8bitLt": diff --git a/src/accelerate/utils/modeling.py b/src/accelerate/utils/modeling.py index fe8358d0756..85481eae283 100644 --- a/src/accelerate/utils/modeling.py +++ b/src/accelerate/utils/modeling.py @@ -365,7 +365,9 @@ def set_module_tensor_to_device( torch.cuda.empty_cache() -def named_module_tensors(module: nn.Module, include_buffers: bool = True, recurse: bool = False): +def named_module_tensors( + module: nn.Module, include_buffers: bool = True, recurse: bool = False, remove_non_persistent: bool = False +): """ A helper function that gathers all the tensors (parameters + buffers) of a given module. If `include_buffers=True` it's the same as doing `module.named_parameters(recurse=recurse) + module.named_buffers(recurse=recurse)`. @@ -377,13 +379,40 @@ def named_module_tensors(module: nn.Module, include_buffers: bool = True, recurs Whether or not to include the buffers in the result. 
recurse (`bool`, *optional`, defaults to `False`): Whether or not to go look in every submodule or just return the direct parameters and buffers. + remove_non_persistent (`bool`, *optional*, defaults to `False`): + Whether or not to remove the non persistent buffer from the buffers. Useful only when include_buffers = + True """ for named_parameter in module.named_parameters(recurse=recurse): yield named_parameter if include_buffers: + non_persistent_buffers = set() + if remove_non_persistent: + non_persistent_buffers = get_non_persistent_buffers(module, recurse=recurse) for named_buffer in module.named_buffers(recurse=recurse): - yield named_buffer + name, _ = named_buffer + if name not in non_persistent_buffers: + yield named_buffer + + +def get_non_persistent_buffers(module: nn.Module, recurse: bool = False): + """ + Gather all non persistent buffers of a given modules into a set + + Args: + module (`nn.Module`): + The module we want the non persistent buffers on. + recurse (`bool`, *optional*, defaults to `False`): + Whether or not to go look in every submodule or just return the direct non persistent buffers. + """ + + non_persistent_buffers_set = module._non_persistent_buffers_set + if recurse: + for _, m in module.named_modules(): + non_persistent_buffers_set |= m._non_persistent_buffers_set + + return non_persistent_buffers_set class FindTiedParametersResult(list): diff --git a/tests/test_big_modeling.py b/tests/test_big_modeling.py index 47aeb7146ff..51ce4a899e4 100644 --- a/tests/test_big_modeling.py +++ b/tests/test_big_modeling.py @@ -45,6 +45,33 @@ def forward(self, x): return self.linear2(self.batchnorm(self.linear1(x))) +class LinearWithNonPersistentBuffers(nn.Module): + def __init__(self, in_features: int, out_features: int, bias: bool = True, device=None, dtype=None) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.in_features = in_features + self.out_features = out_features + self.register_buffer("weight", torch.ones((out_features, in_features), **factory_kwargs)) + if bias: + self.register_buffer("bias", torch.ones(out_features, **factory_kwargs), persistent=False) + else: + self.register_buffer("bias", None) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return torch.nn.functional.linear(input, self.weight, self.bias) + + +class ModelForTestNonPersistentBuffers(nn.Module): + def __init__(self): + super().__init__() + self.linear1 = LinearWithNonPersistentBuffers(3, 4) + self.batchnorm = nn.BatchNorm1d(4) + self.linear2 = LinearWithNonPersistentBuffers(4, 5) + + def forward(self, x): + return self.linear2(self.batchnorm(self.linear1(x))) + + class ModelForTestCopy(nn.Module): def __init__(self, id: int): super().__init__() @@ -302,6 +329,18 @@ def test_dispatch_model(self): output = model(x) self.assertTrue(torch.allclose(expected, output.cpu(), atol=1e-5)) + @require_cuda + def test_dispatch_model_with_non_persistent_buffers(self): + model = ModelForTestNonPersistentBuffers() + device_map = {"linear1": 0, "batchnorm": "cpu", "linear2": "disk"} + x = torch.randn(2, 3) + expected = model(x) + + with TemporaryDirectory() as tmp_dir: + dispatch_model(model, device_map, offload_dir=tmp_dir, offload_buffers=True) + output = model(x) + self.assertTrue(torch.allclose(expected, output.cpu(), atol=1e-5)) + @require_mps def test_dispatch_model_mps(self): model = ModelForTest() diff --git a/tests/test_modeling_utils.py b/tests/test_modeling_utils.py index 7f7bf4c613a..d258938fe44 100644 --- a/tests/test_modeling_utils.py +++ 
b/tests/test_modeling_utils.py @@ -51,6 +51,22 @@ def forward(self, x): return self.linear2(self.batchnorm(self.linear1(x))) +class LinearWithNonPersistentBuffers(nn.Module): + def __init__(self, in_features: int, out_features: int, bias: bool = True, device=None, dtype=None) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.in_features = in_features + self.out_features = out_features + self.register_buffer("weight", torch.empty((out_features, in_features), **factory_kwargs)) + if bias: + self.register_buffer("bias", torch.empty(out_features, **factory_kwargs), persistent=False) + else: + self.register_buffer("bias", None) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return torch.nn.functional.linear(input, self.weight, self.bias) + + def sequential_model(num_layers): layers = OrderedDict([(f"linear{i}", nn.Linear(1000, 1000)) for i in range(1, num_layers + 1)]) return nn.Sequential(layers) @@ -187,6 +203,14 @@ def test_named_tensors(self): ["linear1.weight", "linear1.bias", "batchnorm.weight", "batchnorm.bias", "linear2.weight", "linear2.bias"], ) + model = LinearWithNonPersistentBuffers(10, 10) + + named_tensors = named_module_tensors(model, include_buffers=True, remove_non_persistent=False) + self.assertListEqual([name for name, _ in named_tensors], ["weight", "bias"]) + + named_tensors = named_module_tensors(model, include_buffers=True, remove_non_persistent=True) + self.assertListEqual([name for name, _ in named_tensors], ["weight"]) + def test_find_tied_parameters(self): model = sequential_model(4) self.assertListEqual(find_tied_parameters(model), []) From 427ef8bd009f25c5c1beabd25435266b2704caaf Mon Sep 17 00:00:00 2001 From: Antoni-Joan Solergibert <74564958+TJ-Solergibert@users.noreply.github.com> Date: Mon, 20 Nov 2023 16:42:49 +0100 Subject: [PATCH 13/24] Updated torchrun instructions (#2096) * Updated torchrun instructions * Update examples/README.md Co-authored-by: Benjamin Bossan * Update examples/README.md Co-authored-by: Benjamin Bossan * Update examples/README.md Co-authored-by: Benjamin Bossan * Update examples/README.md Co-authored-by: Benjamin Bossan * Update README.md for torchrun instructions * Added SLURM scripts and updated README * Update examples/Slurm/submit-multinode.sh Co-authored-by: Zach Mueller * Update examples/Slurm/submit-multiGPU.sh Co-authored-by: Zach Mueller * Update examples/README.md Co-authored-by: Zach Mueller * Update examples/README.md Co-authored-by: Zach Mueller * final details * modified argument parser * modified slurm multigpu script * modified multinode slurm script * Added accelerate multine issue * Update examples/README.md Co-authored-by: Zach Mueller * fixed readme commnad * added --main_process_port specification to readme * Revert "modified argument parser" This reverts commit c3bef5cdd11a8a120602b5b7ce158f7400881d7f. 
--------- Co-authored-by: Benjamin Bossan Co-authored-by: Zach Mueller --- examples/README.md | 53 +++++++++++++++--------------- examples/slurm/submit_multigpu.sh | 27 +++++++++++++++ examples/slurm/submit_multinode.sh | 41 +++++++++++++++++++++++ 3 files changed, 95 insertions(+), 26 deletions(-) create mode 100644 examples/slurm/submit_multigpu.sh create mode 100644 examples/slurm/submit_multinode.sh diff --git a/examples/README.md b/examples/README.md index f525607aad3..33e636abfba 100644 --- a/examples/README.md +++ b/examples/README.md @@ -64,9 +64,9 @@ To run it in each of these various modes, use the following commands: accelerate config # This will create a config file on your server accelerate launch ./nlp_example.py # This will run the script on your server ``` - * With traditional PyTorch launcher (`torch.distributed.launch` can be used with older versions of PyTorch) + * With traditional PyTorch launcher (`python -m torch.distributed.run` can be used instead of `torchrun`) ```bash - python -m torchrun --nproc_per_node 2 --use_env ./nlp_example.py + torchrun --nproc_per_node 2 ./nlp_example.py ``` - multi GPUs, multi node (several machines, using PyTorch distributed mode) * With Accelerate config and launcher, on each machine: @@ -74,18 +74,15 @@ To run it in each of these various modes, use the following commands: accelerate config # This will create a config file on each server accelerate launch ./nlp_example.py # This will run the script on each server ``` - * With PyTorch launcher only (`torch.distributed.launch` can be used in older versions of PyTorch) + * With PyTorch launcher only (`python -m torch.distributed.run` can be used instead of `torchrun`). Run this command on each node: ```bash - python -m torchrun --nproc_per_node 2 \ - --use_env \ - --node_rank 0 \ - --master_addr master_node_ip_address \ - ./nlp_example.py # On the first server - python -m torchrun --nproc_per_node 2 \ - --use_env \ - --node_rank 1 \ - --master_addr master_node_ip_address \ - ./nlp_example.py # On the second server + torchrun \ # python -m torch.distributed.run + --nproc_per_node 2 \ + --nnodes 2 \ + --rdzv_id 2299 \ # A unique job id + --rdzv_backend c10d \ + --rdzv_endpoint master_node_ip_address:29500 \ + ./nlp_example.py ``` - (multi) TPUs * With Accelerate config and launcher @@ -152,9 +149,9 @@ To run it in each of these various modes, use the following commands: accelerate config --config_file config.yaml # This will create a config file on your server to `config.yaml` accelerate launch --config_file config.yaml ./cv_example.py --data_dir path_to_data # This will run the script on your server ``` - * With traditional PyTorch launcher (`torch.distributed.launch` can be used with older versions of PyTorch) + * With traditional PyTorch launcher (`python -m torch.distributed.run` can be used instead of `torchrun`) ```bash - python -m torchrun --nproc_per_node 2 --use_env ./cv_example.py --data_dir path_to_data + torchrun --nproc_per_node 2 ./cv_example.py --data_dir path_to_data ``` - multi GPUs, multi node (several machines, using PyTorch distributed mode) * With Accelerate config and launcher, on each machine: @@ -162,18 +159,15 @@ To run it in each of these various modes, use the following commands: accelerate config --config_file config.yaml # This will create a config file on your server to `config.yaml` accelerate launch --config_file config.yaml ./cv_example.py --data_dir path_to_data # This will run the script on each server ``` - * With PyTorch launcher only 
(`torch.distributed.launch` can be used with older versions of PyTorch) + * With PyTorch launcher only (`python -m torch.distributed.run` can be used instead of `torchrun`). Run this command on each node: ```bash - python -m torchrun --nproc_per_node 2 \ - --use_env \ - --node_rank 0 \ - --master_addr master_node_ip_address \ - ./cv_example.py --data_dir path_to_data # On the first server - python -m torchrun --nproc_per_node 2 \ - --use_env \ - --node_rank 1 \ - --master_addr master_node_ip_address \ - ./cv_example.py --data_dir path_to_data # On the second server + torchrun \ # python -m torch.distributed.run + --nproc_per_node 2 \ + --nnodes 2 \ + --rdzv_id 2299 \ # A unique job id + --rdzv_backend c10d \ + --rdzv_endpoint master_node_ip_address:29500 \ + ./cv_example.py --data_dir path_to_data ``` - (multi) TPUs * With Accelerate config and launcher @@ -206,6 +200,13 @@ with `pip install runhouse`, and you can refer to for hardware setup instructions, or this [Colab tutorial](https://colab.research.google.com/drive/1qVwYyLTCPYPSdz9ZX7BZl9Qm0A3j7RJe) for a more in-depth walkthrough. +## SLURM Scripts +In [/slurm/submit_multigpu.sh](./slurm/submit_multigpu.sh) and [/slurm/submit_multinode.sh](./slurm/submit_multinode.sh) we present two scripts for running the examples on a machine with [SLURM](https://slurm.schedmd.com/documentation.html) workload manager. + +In [/slurm/submit_multigpu.sh](./slurm/submit_multigpu.sh) the only parameter in the launcher that needs to be modified is `--num_processes`, which determines the number of GPUs we will use. In this case, using the environment variable `$SLURM_GPUS`, we indicate that we want to utilize all the GPUs available on the node we have requested. + +In [/slurm/submit_multinode.sh](./slurm/submit_multinode.sh) we must specify the number of nodes that will be part of the training (`--num_machines`), how many GPUs we will use in total (`--num_processes`), the [`backend`](https://pytorch.org/docs/stable/elastic/run.html#note-on-rendezvous-backend), `--main_process_ip` which will be the address the master node and the `--main_process_port`. + ## Finer Examples While the first two scripts are extremely barebones when it comes to what you can do with accelerate, more advanced features are documented in two other locations. diff --git a/examples/slurm/submit_multigpu.sh b/examples/slurm/submit_multigpu.sh new file mode 100644 index 00000000000..709d7490064 --- /dev/null +++ b/examples/slurm/submit_multigpu.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +#SBATCH --job-name=multigpu +#SBATCH -D . 
+#SBATCH --output=O-%x.%j +#SBATCH --error=E-%x.%j +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # number of MP tasks +#SBATCH --gres=gpu:4 # number of GPUs per node +#SBATCH --cpus-per-task=160 # number of cores per tasks +#SBATCH --time=01:59:00 # maximum execution time (HH:MM:SS) + +###################### +### Set enviroment ### +###################### +source activateEnviroment.sh +export GPUS_PER_NODE=4 +###################### + +export SCRIPT=/accelerate/examples/complete_nlp_example.py +export SCRIPT_ARGS=" \ + --mixed_precision fp16 \ + --output_dir /accelerate/examples/output \ + --with_tracking \ + " + +accelerate launch --num_processes $GPUS_PER_NODE $SCRIPT $SCRIPT_ARGS \ No newline at end of file diff --git a/examples/slurm/submit_multinode.sh b/examples/slurm/submit_multinode.sh new file mode 100644 index 00000000000..dfd3fa023c5 --- /dev/null +++ b/examples/slurm/submit_multinode.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +#SBATCH --job-name=multinode +#SBATCH -D . +#SBATCH --output=O-%x.%j +#SBATCH --error=E-%x.%j +#SBATCH --nodes=4 # number of nodes +#SBATCH --ntasks-per-node=1 # number of MP tasks +#SBATCH --gres=gpu:4 # number of GPUs per node +#SBATCH --cpus-per-task=160 # number of cores per tasks +#SBATCH --time=01:59:00 # maximum execution time (HH:MM:SS) + +###################### +### Set enviroment ### +###################### +source activateEnviroment.sh +export GPUS_PER_NODE=4 +###################### + +###################### +#### Set network ##### +###################### +head_node_ip=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +###################### + +export LAUNCHER="accelerate launch \ + --num_processes $((SLURM_NNODES * GPUS_PER_NODE)) \ + --num_machines $SLURM_NNODES \ + --rdzv_backend c10d \ + --main_process_ip $head_node_ip \ + --main_process_port 29500 \ + " +export SCRIPT="/accelerate/examples/complete_nlp_example.py" +export SCRIPT_ARGS=" \ + --mixed_precision fp16 \ + --output_dir /accelerate/examples/output \ + " + +# This step is necessary because accelerate launch does not handle multiline arguments properly +export CMD="$LAUNCHER $PYTHON_FILE $ARGS" +srun $CMD \ No newline at end of file From ca300c0a04f843da2c5c8559e7d728926f7e8bf2 Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Mon, 20 Nov 2023 11:41:57 -0500 Subject: [PATCH 14/24] New CI Runners (#2087) * Try merge tests * Fix * Checkout branch * Fix pip install * rebase * Colons * right one * use master * Rm * Add needs * Better clean * always * Forgot other * test on AWS * update all labels * fix multi-gpu working directory * limit to 2 GPU * force run on kube * move build docker image to new ci * test build on CPU instance * move build docker image release to new ci * move scheduled slow tests to new ci * move integration test to new ci * Comments * Right CPU tags * Right machines * PR comments --------- Co-authored-by: Guillaume LEGENDRE --- .../workflows/build-docker-images-release.yml | 4 +- .github/workflows/build_docker_images.yml | 15 +---- .github/workflows/nightly.yml | 23 +++++--- .github/workflows/run_merge_tests.yml | 59 +++++++++++-------- .../self_hosted_integration_tests.yml | 44 +++++++------- 5 files changed, 75 insertions(+), 70 deletions(-) diff --git a/.github/workflows/build-docker-images-release.yml b/.github/workflows/build-docker-images-release.yml index 8d5f2194b78..efb6a95da6c 100644 --- a/.github/workflows/build-docker-images-release.yml +++ b/.github/workflows/build-docker-images-release.yml @@ -21,7 +21,7 @@ jobs: version-cpu: name: "Latest 
Accelerate CPU [version]" - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + runs-on: [self-hosted, intel-cpu, 8-cpu, ci] needs: get-version steps: - name: Set up Docker Buildx @@ -41,7 +41,7 @@ jobs: version-cuda: name: "Latest Accelerate GPU [version]" - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, daily-ci] needs: get-version steps: - name: Set up Docker Buildx diff --git a/.github/workflows/build_docker_images.yml b/.github/workflows/build_docker_images.yml index 59f3e4dda61..75b9fb9eefe 100644 --- a/.github/workflows/build_docker_images.yml +++ b/.github/workflows/build_docker_images.yml @@ -11,19 +11,9 @@ concurrency: cancel-in-progress: false jobs: - clean-storage: - name: "Clean docker image storage" - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] - steps: - - name: Clean storage - run: | - docker image prune --all -f --filter "until=48h" - docker system prune --all -f --filter "until=48h" - latest-cpu: name: "Latest Accelerate CPU [dev]" - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] - needs: clean-storage + runs-on: [self-hosted, intel-cpu, 8-cpu, ci] steps: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v2 @@ -41,8 +31,7 @@ jobs: latest-cuda: name: "Latest Accelerate GPU [dev]" - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] - needs: clean-storage + runs-on: [self-hosted, nvidia-gpu, t4, daily-ci] steps: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v2 diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 09e64e060e3..a06cae176c7 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -13,7 +13,7 @@ env: jobs: run_all_tests_single_gpu: - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, daily-ci] env: CUDA_VISIBLE_DEVICES: "0" TEST_TYPE: "single_gpu" @@ -22,23 +22,25 @@ jobs: options: --gpus all --shm-size "16gb" defaults: run: - working-directory: accelerate/ shell: bash steps: - name: Update clone & pip install run: | source activate accelerate - git config --global --add safe.directory '*' - git fetch && git checkout ${{ github.sha }} + git clone https://github.com/huggingface/accelerate; + cd accelerate; + git checkout ${{ github.sha }}; pip install -e . --no-deps pip install pytest-reportlog tabulate - name: Run test on GPUs + working-directory: accelerate run: | source activate accelerate make test - name: Run examples on GPUs + working-directory: accelerate if: always() run: | source activate accelerate @@ -46,13 +48,14 @@ jobs: make test_examples - name: Generate Report + working-directory: accelerate if: always() run: | pip install slack_sdk tabulate python utils/log_reports.py >> $GITHUB_STEP_SUMMARY run_all_tests_multi_gpu: - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, daily-ci] env: CUDA_VISIBLE_DEVICES: "0,1" TEST_TYPE: "multi_gpu" @@ -61,18 +64,19 @@ jobs: options: --gpus all --shm-size "16gb" defaults: run: - working-directory: accelerate/ shell: bash steps: - name: Update clone run: | source activate accelerate - git config --global --add safe.directory '*' - git fetch && git checkout ${{ github.sha }} + git clone https://github.com/huggingface/accelerate; + cd accelerate; + git checkout ${{ github.sha }}; pip install -e . 
--no-deps pip install pytest-reportlog tabulate - name: Run core and big modeling tests on GPUs + working-directory: accelerate run: | source activate accelerate make test_core @@ -80,12 +84,14 @@ jobs: make test_cli - name: Run Integration tests on GPUs + working-directory: accelerate if: always() run: | source activate accelerate make test_integrations - name: Run examples on GPUs + working-directory: accelerate if: always() run: | source activate accelerate @@ -93,6 +99,7 @@ jobs: make test_examples - name: Generate Report + working-directory: accelerate if: always() run: | pip install slack_sdk tabulate diff --git a/.github/workflows/run_merge_tests.yml b/.github/workflows/run_merge_tests.yml index 7dacab8c508..42bf11e7516 100644 --- a/.github/workflows/run_merge_tests.yml +++ b/.github/workflows/run_merge_tests.yml @@ -10,7 +10,7 @@ env: jobs: run_all_tests_single_gpu: - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, push-ci] env: CUDA_VISIBLE_DEVICES: "0" container: @@ -18,72 +18,81 @@ jobs: options: --gpus all --shm-size "16gb" defaults: run: - working-directory: accelerate/ shell: bash steps: - - name: Update clone & pip install + - name: Install accelerate run: | - source activate accelerate - git config --global --add safe.directory '*' - git fetch && git checkout ${{ github.sha }} - pip install -e .[testing,test_trackers] -U - pip install pytest-reportlog tabulate + source activate accelerate; + git clone https://github.com/huggingface/accelerate; + cd accelerate; + git checkout ${{ github.sha }}; + pip install -e .[testing,test_trackers] -U; + pip install pytest-reportlog tabulate ; - - name: Run CLI tests + - name: Run CLI tests (use make cli) + working-directory: accelerate run: | - source activate accelerate + source activate accelerate; make test_cli - name: Run test on GPUs + working-directory: accelerate if: always() run: | - source activate accelerate + source activate accelerate; make test - name: Run examples on GPUs + working-directory: accelerate if: always() run: | - source activate accelerate - pip uninstall comet_ml -y + source activate accelerate; + pip uninstall comet_ml -y; make test_examples - name: Generate Report + working-directory: accelerate if: always() run: | - pip install tabulate + pip install tabulate; python utils/log_reports.py >> $GITHUB_STEP_SUMMARY run_all_tests_multi_gpu: - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, push-ci] + env: + CUDA_VISIBLE_DEVICES: 0,1 container: image: huggingface/accelerate-gpu:latest options: --gpus all --shm-size "16gb" defaults: run: - working-directory: accelerate/ shell: bash steps: - name: Update clone run: | - source activate accelerate - git config --global --add safe.directory '*' - git fetch && git checkout ${{ github.sha }} - pip install -e .[testing,test_trackers] -U + source activate accelerate; + git clone https://github.com/huggingface/accelerate; + cd accelerate; + git checkout ${{ github.sha }}; + pip install -e .[testing,test_trackers] -U; pip install pytest-reportlog tabulate - name: Run test on GPUs + working-directory: accelerate run: | - source activate accelerate + source activate accelerate; make test - name: Run examples on GPUs + working-directory: accelerate if: always() run: | - source activate accelerate - pip uninstall comet_ml -y + source activate accelerate; + pip uninstall comet_ml -y; make test_examples - name: Generate Report + working-directory: accelerate if: always() 
run: | - pip install tabulate - python utils/log_reports.py >> $GITHUB_STEP_SUMMARY \ No newline at end of file + source activate accelerate; + python utils/log_reports.py >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/self_hosted_integration_tests.yml b/.github/workflows/self_hosted_integration_tests.yml index 94e50e61ff3..cd82295e4e2 100644 --- a/.github/workflows/self_hosted_integration_tests.yml +++ b/.github/workflows/self_hosted_integration_tests.yml @@ -25,7 +25,7 @@ jobs: container: image: huggingface/accelerate-gpu:latest options: --gpus all --shm-size "16gb" - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, push-ci] strategy: fail-fast: false matrix: @@ -34,22 +34,22 @@ jobs: "0,1" ] steps: - - name: Update accelerate clone and pip install - working-directory: accelerate/ - run: + - name: Install transformers + run: | source activate accelerate; - git config --global --add safe.directory '*'; - git checkout main && git fetch && git checkout ${{ github.sha }}; - pip install -e .; + git clone https://github.com/huggingface/transformers --depth 1; + cd transformers; + pip install .[torch,deepspeed-testing]; + cd ..; - - name: Update transformers clone & pip install - working-directory: transformers/ + - name: Install accelerate run: | - source activate accelerate - git config --global --add safe.directory '*' - git checkout main && git pull - pip install .[torch,deepspeed-testing] - pip uninstall comet_ml wandb -y + source activate accelerate; + git clone https://github.com/huggingface/accelerate; + cd accelerate; + git checkout ${{ github.sha }} ; + pip install -e .[testing]; + cd ..; - name: Show installed libraries run: | @@ -89,20 +89,20 @@ jobs: container: image: huggingface/accelerate-gpu:latest options: --gpus all --shm-size "16gb" - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, push-ci] strategy: fail-fast: false steps: - - name: Update accelerate clone and pip install - working-directory: accelerate/ + - name: Install accelerate run: source activate accelerate; - git config --global --add safe.directory '*'; - git checkout main && git fetch && git checkout ${{ github.sha }}; - pip install -e .; + git clone https://github.com/huggingface/accelerate; + cd accelerate; + git checkout ${{ github.sha }}; + pip install -e .[testing]; + cd .. - - name: Update skorch clone & pip install - working-directory: skorch/ + - name: Install skorch run: | source activate accelerate git config --global --add safe.directory '*' From 2b25b8b3c59b82e36d798537656f8d348bc86b6a Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Mon, 20 Nov 2023 12:06:33 -0500 Subject: [PATCH 15/24] Revert "New CI Runners (#2087)" (#2172) This reverts commit ca300c0a04f843da2c5c8559e7d728926f7e8bf2. 
--- .../workflows/build-docker-images-release.yml | 4 +- .github/workflows/build_docker_images.yml | 15 ++++- .github/workflows/nightly.yml | 23 +++----- .github/workflows/run_merge_tests.yml | 59 ++++++++----------- .../self_hosted_integration_tests.yml | 44 +++++++------- 5 files changed, 70 insertions(+), 75 deletions(-) diff --git a/.github/workflows/build-docker-images-release.yml b/.github/workflows/build-docker-images-release.yml index efb6a95da6c..8d5f2194b78 100644 --- a/.github/workflows/build-docker-images-release.yml +++ b/.github/workflows/build-docker-images-release.yml @@ -21,7 +21,7 @@ jobs: version-cpu: name: "Latest Accelerate CPU [version]" - runs-on: [self-hosted, intel-cpu, 8-cpu, ci] + runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] needs: get-version steps: - name: Set up Docker Buildx @@ -41,7 +41,7 @@ jobs: version-cuda: name: "Latest Accelerate GPU [version]" - runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, daily-ci] + runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] needs: get-version steps: - name: Set up Docker Buildx diff --git a/.github/workflows/build_docker_images.yml b/.github/workflows/build_docker_images.yml index 75b9fb9eefe..59f3e4dda61 100644 --- a/.github/workflows/build_docker_images.yml +++ b/.github/workflows/build_docker_images.yml @@ -11,9 +11,19 @@ concurrency: cancel-in-progress: false jobs: + clean-storage: + name: "Clean docker image storage" + runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + steps: + - name: Clean storage + run: | + docker image prune --all -f --filter "until=48h" + docker system prune --all -f --filter "until=48h" + latest-cpu: name: "Latest Accelerate CPU [dev]" - runs-on: [self-hosted, intel-cpu, 8-cpu, ci] + runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + needs: clean-storage steps: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v2 @@ -31,7 +41,8 @@ jobs: latest-cuda: name: "Latest Accelerate GPU [dev]" - runs-on: [self-hosted, nvidia-gpu, t4, daily-ci] + runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + needs: clean-storage steps: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v2 diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index a06cae176c7..09e64e060e3 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -13,7 +13,7 @@ env: jobs: run_all_tests_single_gpu: - runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, daily-ci] + runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] env: CUDA_VISIBLE_DEVICES: "0" TEST_TYPE: "single_gpu" @@ -22,25 +22,23 @@ jobs: options: --gpus all --shm-size "16gb" defaults: run: + working-directory: accelerate/ shell: bash steps: - name: Update clone & pip install run: | source activate accelerate - git clone https://github.com/huggingface/accelerate; - cd accelerate; - git checkout ${{ github.sha }}; + git config --global --add safe.directory '*' + git fetch && git checkout ${{ github.sha }} pip install -e . 
--no-deps pip install pytest-reportlog tabulate - name: Run test on GPUs - working-directory: accelerate run: | source activate accelerate make test - name: Run examples on GPUs - working-directory: accelerate if: always() run: | source activate accelerate @@ -48,14 +46,13 @@ jobs: make test_examples - name: Generate Report - working-directory: accelerate if: always() run: | pip install slack_sdk tabulate python utils/log_reports.py >> $GITHUB_STEP_SUMMARY run_all_tests_multi_gpu: - runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, daily-ci] + runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] env: CUDA_VISIBLE_DEVICES: "0,1" TEST_TYPE: "multi_gpu" @@ -64,19 +61,18 @@ jobs: options: --gpus all --shm-size "16gb" defaults: run: + working-directory: accelerate/ shell: bash steps: - name: Update clone run: | source activate accelerate - git clone https://github.com/huggingface/accelerate; - cd accelerate; - git checkout ${{ github.sha }}; + git config --global --add safe.directory '*' + git fetch && git checkout ${{ github.sha }} pip install -e . --no-deps pip install pytest-reportlog tabulate - name: Run core and big modeling tests on GPUs - working-directory: accelerate run: | source activate accelerate make test_core @@ -84,14 +80,12 @@ jobs: make test_cli - name: Run Integration tests on GPUs - working-directory: accelerate if: always() run: | source activate accelerate make test_integrations - name: Run examples on GPUs - working-directory: accelerate if: always() run: | source activate accelerate @@ -99,7 +93,6 @@ jobs: make test_examples - name: Generate Report - working-directory: accelerate if: always() run: | pip install slack_sdk tabulate diff --git a/.github/workflows/run_merge_tests.yml b/.github/workflows/run_merge_tests.yml index 42bf11e7516..7dacab8c508 100644 --- a/.github/workflows/run_merge_tests.yml +++ b/.github/workflows/run_merge_tests.yml @@ -10,7 +10,7 @@ env: jobs: run_all_tests_single_gpu: - runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, push-ci] + runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] env: CUDA_VISIBLE_DEVICES: "0" container: @@ -18,81 +18,72 @@ jobs: options: --gpus all --shm-size "16gb" defaults: run: + working-directory: accelerate/ shell: bash steps: - - name: Install accelerate + - name: Update clone & pip install run: | - source activate accelerate; - git clone https://github.com/huggingface/accelerate; - cd accelerate; - git checkout ${{ github.sha }}; - pip install -e .[testing,test_trackers] -U; - pip install pytest-reportlog tabulate ; + source activate accelerate + git config --global --add safe.directory '*' + git fetch && git checkout ${{ github.sha }} + pip install -e .[testing,test_trackers] -U + pip install pytest-reportlog tabulate - - name: Run CLI tests (use make cli) - working-directory: accelerate + - name: Run CLI tests run: | - source activate accelerate; + source activate accelerate make test_cli - name: Run test on GPUs - working-directory: accelerate if: always() run: | - source activate accelerate; + source activate accelerate make test - name: Run examples on GPUs - working-directory: accelerate if: always() run: | - source activate accelerate; - pip uninstall comet_ml -y; + source activate accelerate + pip uninstall comet_ml -y make test_examples - name: Generate Report - working-directory: accelerate if: always() run: | - pip install tabulate; + pip install tabulate python utils/log_reports.py >> $GITHUB_STEP_SUMMARY run_all_tests_multi_gpu: - runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, push-ci] - env: - 
CUDA_VISIBLE_DEVICES: 0,1 + runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] container: image: huggingface/accelerate-gpu:latest options: --gpus all --shm-size "16gb" defaults: run: + working-directory: accelerate/ shell: bash steps: - name: Update clone run: | - source activate accelerate; - git clone https://github.com/huggingface/accelerate; - cd accelerate; - git checkout ${{ github.sha }}; - pip install -e .[testing,test_trackers] -U; + source activate accelerate + git config --global --add safe.directory '*' + git fetch && git checkout ${{ github.sha }} + pip install -e .[testing,test_trackers] -U pip install pytest-reportlog tabulate - name: Run test on GPUs - working-directory: accelerate run: | - source activate accelerate; + source activate accelerate make test - name: Run examples on GPUs - working-directory: accelerate if: always() run: | - source activate accelerate; - pip uninstall comet_ml -y; + source activate accelerate + pip uninstall comet_ml -y make test_examples - name: Generate Report - working-directory: accelerate if: always() run: | - source activate accelerate; - python utils/log_reports.py >> $GITHUB_STEP_SUMMARY + pip install tabulate + python utils/log_reports.py >> $GITHUB_STEP_SUMMARY \ No newline at end of file diff --git a/.github/workflows/self_hosted_integration_tests.yml b/.github/workflows/self_hosted_integration_tests.yml index cd82295e4e2..94e50e61ff3 100644 --- a/.github/workflows/self_hosted_integration_tests.yml +++ b/.github/workflows/self_hosted_integration_tests.yml @@ -25,7 +25,7 @@ jobs: container: image: huggingface/accelerate-gpu:latest options: --gpus all --shm-size "16gb" - runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, push-ci] + runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] strategy: fail-fast: false matrix: @@ -34,22 +34,22 @@ jobs: "0,1" ] steps: - - name: Install transformers - run: | + - name: Update accelerate clone and pip install + working-directory: accelerate/ + run: source activate accelerate; - git clone https://github.com/huggingface/transformers --depth 1; - cd transformers; - pip install .[torch,deepspeed-testing]; - cd ..; + git config --global --add safe.directory '*'; + git checkout main && git fetch && git checkout ${{ github.sha }}; + pip install -e .; - - name: Install accelerate + - name: Update transformers clone & pip install + working-directory: transformers/ run: | - source activate accelerate; - git clone https://github.com/huggingface/accelerate; - cd accelerate; - git checkout ${{ github.sha }} ; - pip install -e .[testing]; - cd ..; + source activate accelerate + git config --global --add safe.directory '*' + git checkout main && git pull + pip install .[torch,deepspeed-testing] + pip uninstall comet_ml wandb -y - name: Show installed libraries run: | @@ -89,20 +89,20 @@ jobs: container: image: huggingface/accelerate-gpu:latest options: --gpus all --shm-size "16gb" - runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, push-ci] + runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] strategy: fail-fast: false steps: - - name: Install accelerate + - name: Update accelerate clone and pip install + working-directory: accelerate/ run: source activate accelerate; - git clone https://github.com/huggingface/accelerate; - cd accelerate; - git checkout ${{ github.sha }}; - pip install -e .[testing]; - cd .. 
+ git config --global --add safe.directory '*'; + git checkout main && git fetch && git checkout ${{ github.sha }}; + pip install -e .; - - name: Install skorch + - name: Update skorch clone & pip install + working-directory: skorch/ run: | source activate accelerate git config --global --add safe.directory '*' From 1243191ecbbd6a91a221e3cb56276b8e95f6c028 Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Mon, 20 Nov 2023 13:01:12 -0500 Subject: [PATCH 16/24] [Working again] New CI (#2173) * Try merge tests * Fix * Checkout branch * Fix pip install * rebase * Colons * right one * use master * Rm * Add needs * Better clean * always * Forgot other * test on AWS * update all labels * fix multi-gpu working directory * limit to 2 GPU * force run on kube * move build docker image to new ci * test build on CPU instance * move build docker image release to new ci * move scheduled slow tests to new ci * move integration test to new ci * Comments * Right CPU tags * Right machines * PR comments * Fix issues * Some trailers --------- Co-authored-by: Guillaume LEGENDRE --- .../workflows/build-docker-images-release.yml | 4 +- .github/workflows/build_docker_images.yml | 15 +---- .github/workflows/nightly.yml | 23 +++++--- .github/workflows/run_merge_tests.yml | 59 +++++++++++-------- .../self_hosted_integration_tests.yml | 44 +++++++------- 5 files changed, 75 insertions(+), 70 deletions(-) diff --git a/.github/workflows/build-docker-images-release.yml b/.github/workflows/build-docker-images-release.yml index 8d5f2194b78..20a9ea51e5a 100644 --- a/.github/workflows/build-docker-images-release.yml +++ b/.github/workflows/build-docker-images-release.yml @@ -21,7 +21,7 @@ jobs: version-cpu: name: "Latest Accelerate CPU [version]" - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + runs-on: [self-hosted, intel-cpu, 8-cpu, ci] needs: get-version steps: - name: Set up Docker Buildx @@ -41,7 +41,7 @@ jobs: version-cuda: name: "Latest Accelerate GPU [version]" - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, ci] needs: get-version steps: - name: Set up Docker Buildx diff --git a/.github/workflows/build_docker_images.yml b/.github/workflows/build_docker_images.yml index 59f3e4dda61..557032a0de2 100644 --- a/.github/workflows/build_docker_images.yml +++ b/.github/workflows/build_docker_images.yml @@ -11,19 +11,9 @@ concurrency: cancel-in-progress: false jobs: - clean-storage: - name: "Clean docker image storage" - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] - steps: - - name: Clean storage - run: | - docker image prune --all -f --filter "until=48h" - docker system prune --all -f --filter "until=48h" - latest-cpu: name: "Latest Accelerate CPU [dev]" - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] - needs: clean-storage + runs-on: [self-hosted, intel-cpu, 8-cpu, ci] steps: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v2 @@ -41,8 +31,7 @@ jobs: latest-cuda: name: "Latest Accelerate GPU [dev]" - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] - needs: clean-storage + runs-on: [self-hosted, nvidia-gpu, t4, ci] steps: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v2 diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 09e64e060e3..3e7f77bf9fd 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -13,7 +13,7 @@ env: jobs: run_all_tests_single_gpu: - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, ci] env: 
CUDA_VISIBLE_DEVICES: "0" TEST_TYPE: "single_gpu" @@ -22,23 +22,25 @@ jobs: options: --gpus all --shm-size "16gb" defaults: run: - working-directory: accelerate/ shell: bash steps: - name: Update clone & pip install run: | source activate accelerate - git config --global --add safe.directory '*' - git fetch && git checkout ${{ github.sha }} + git clone https://github.com/huggingface/accelerate; + cd accelerate; + git checkout ${{ github.sha }}; pip install -e . --no-deps pip install pytest-reportlog tabulate - name: Run test on GPUs + working-directory: accelerate run: | source activate accelerate make test - name: Run examples on GPUs + working-directory: accelerate if: always() run: | source activate accelerate @@ -46,13 +48,14 @@ jobs: make test_examples - name: Generate Report + working-directory: accelerate if: always() run: | pip install slack_sdk tabulate python utils/log_reports.py >> $GITHUB_STEP_SUMMARY run_all_tests_multi_gpu: - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, ci] env: CUDA_VISIBLE_DEVICES: "0,1" TEST_TYPE: "multi_gpu" @@ -61,18 +64,19 @@ jobs: options: --gpus all --shm-size "16gb" defaults: run: - working-directory: accelerate/ shell: bash steps: - name: Update clone run: | source activate accelerate - git config --global --add safe.directory '*' - git fetch && git checkout ${{ github.sha }} + git clone https://github.com/huggingface/accelerate; + cd accelerate; + git checkout ${{ github.sha }}; pip install -e . --no-deps pip install pytest-reportlog tabulate - name: Run core and big modeling tests on GPUs + working-directory: accelerate run: | source activate accelerate make test_core @@ -80,12 +84,14 @@ jobs: make test_cli - name: Run Integration tests on GPUs + working-directory: accelerate if: always() run: | source activate accelerate make test_integrations - name: Run examples on GPUs + working-directory: accelerate if: always() run: | source activate accelerate @@ -93,6 +99,7 @@ jobs: make test_examples - name: Generate Report + working-directory: accelerate if: always() run: | pip install slack_sdk tabulate diff --git a/.github/workflows/run_merge_tests.yml b/.github/workflows/run_merge_tests.yml index 7dacab8c508..ef6ea84bc7e 100644 --- a/.github/workflows/run_merge_tests.yml +++ b/.github/workflows/run_merge_tests.yml @@ -10,7 +10,7 @@ env: jobs: run_all_tests_single_gpu: - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, ci] env: CUDA_VISIBLE_DEVICES: "0" container: @@ -18,72 +18,81 @@ jobs: options: --gpus all --shm-size "16gb" defaults: run: - working-directory: accelerate/ shell: bash steps: - - name: Update clone & pip install + - name: Install accelerate run: | - source activate accelerate - git config --global --add safe.directory '*' - git fetch && git checkout ${{ github.sha }} - pip install -e .[testing,test_trackers] -U - pip install pytest-reportlog tabulate + source activate accelerate; + git clone https://github.com/huggingface/accelerate; + cd accelerate; + git checkout ${{ github.sha }}; + pip install -e .[testing,test_trackers] -U; + pip install pytest-reportlog tabulate ; - - name: Run CLI tests + - name: Run CLI tests (use make cli) + working-directory: accelerate run: | - source activate accelerate + source activate accelerate; make test_cli - name: Run test on GPUs + working-directory: accelerate if: always() run: | - source activate accelerate + source activate accelerate; make test - name: Run examples on GPUs + 
working-directory: accelerate if: always() run: | - source activate accelerate - pip uninstall comet_ml -y + source activate accelerate; + pip uninstall comet_ml -y; make test_examples - name: Generate Report + working-directory: accelerate if: always() run: | - pip install tabulate + pip install tabulate; python utils/log_reports.py >> $GITHUB_STEP_SUMMARY run_all_tests_multi_gpu: - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, ci] + env: + CUDA_VISIBLE_DEVICES: 0,1 container: image: huggingface/accelerate-gpu:latest options: --gpus all --shm-size "16gb" defaults: run: - working-directory: accelerate/ shell: bash steps: - name: Update clone run: | - source activate accelerate - git config --global --add safe.directory '*' - git fetch && git checkout ${{ github.sha }} - pip install -e .[testing,test_trackers] -U + source activate accelerate; + git clone https://github.com/huggingface/accelerate; + cd accelerate; + git checkout ${{ github.sha }}; + pip install -e .[testing,test_trackers] -U; pip install pytest-reportlog tabulate - name: Run test on GPUs + working-directory: accelerate run: | - source activate accelerate + source activate accelerate; make test - name: Run examples on GPUs + working-directory: accelerate if: always() run: | - source activate accelerate - pip uninstall comet_ml -y + source activate accelerate; + pip uninstall comet_ml -y; make test_examples - name: Generate Report + working-directory: accelerate if: always() run: | - pip install tabulate - python utils/log_reports.py >> $GITHUB_STEP_SUMMARY \ No newline at end of file + source activate accelerate; + python utils/log_reports.py >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/self_hosted_integration_tests.yml b/.github/workflows/self_hosted_integration_tests.yml index 94e50e61ff3..3c12b51e259 100644 --- a/.github/workflows/self_hosted_integration_tests.yml +++ b/.github/workflows/self_hosted_integration_tests.yml @@ -25,7 +25,7 @@ jobs: container: image: huggingface/accelerate-gpu:latest options: --gpus all --shm-size "16gb" - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, ci] strategy: fail-fast: false matrix: @@ -34,22 +34,22 @@ jobs: "0,1" ] steps: - - name: Update accelerate clone and pip install - working-directory: accelerate/ - run: + - name: Install transformers + run: | source activate accelerate; - git config --global --add safe.directory '*'; - git checkout main && git fetch && git checkout ${{ github.sha }}; - pip install -e .; + git clone https://github.com/huggingface/transformers --depth 1; + cd transformers; + pip install .[torch,deepspeed-testing]; + cd ..; - - name: Update transformers clone & pip install - working-directory: transformers/ + - name: Install accelerate run: | - source activate accelerate - git config --global --add safe.directory '*' - git checkout main && git pull - pip install .[torch,deepspeed-testing] - pip uninstall comet_ml wandb -y + source activate accelerate; + git clone https://github.com/huggingface/accelerate; + cd accelerate; + git checkout ${{ github.sha }} ; + pip install -e .[testing]; + cd ..; - name: Show installed libraries run: | @@ -89,20 +89,20 @@ jobs: container: image: huggingface/accelerate-gpu:latest options: --gpus all --shm-size "16gb" - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, ci] strategy: fail-fast: false steps: - - name: Update accelerate clone and pip install - 
working-directory: accelerate/ + - name: Install accelerate run: source activate accelerate; - git config --global --add safe.directory '*'; - git checkout main && git fetch && git checkout ${{ github.sha }}; - pip install -e .; + git clone https://github.com/huggingface/accelerate; + cd accelerate; + git checkout ${{ github.sha }}; + pip install -e .[testing]; + cd .. - - name: Update skorch clone & pip install - working-directory: skorch/ + - name: Install skorch run: | source activate accelerate git config --global --add safe.directory '*' From b8ca803f98430048ff479bdc351381871c0074a5 Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Mon, 20 Nov 2023 13:11:08 -0500 Subject: [PATCH 17/24] Don't make it wait --- .github/workflows/build_and_run_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_run_tests.yml b/.github/workflows/build_and_run_tests.yml index 1f29e189505..fffda0dc921 100644 --- a/.github/workflows/build_and_run_tests.yml +++ b/.github/workflows/build_and_run_tests.yml @@ -45,6 +45,6 @@ jobs: uses: ./.github/workflows/run_merge_tests.yml run-integration-tests: - needs: run-merge-tests + needs: build-docker-containers if: always() uses: ./.github/workflows/self_hosted_integration_tests.yml \ No newline at end of file From 7d430cf8dec31848dc82e3942f4881801c639032 Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Mon, 20 Nov 2023 13:30:23 -0500 Subject: [PATCH 18/24] skorch --- .github/workflows/self_hosted_integration_tests.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/self_hosted_integration_tests.yml b/.github/workflows/self_hosted_integration_tests.yml index 3c12b51e259..ca46fe2e520 100644 --- a/.github/workflows/self_hosted_integration_tests.yml +++ b/.github/workflows/self_hosted_integration_tests.yml @@ -105,6 +105,8 @@ jobs: - name: Install skorch run: | source activate accelerate + git clone https://github.com/huggingface/skorch; + cd skorch; git config --global --add safe.directory '*' git checkout master && git pull pip install .[testing] From 0e51680994f5b9085de615604661d9022439bd0b Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Mon, 20 Nov 2023 14:03:49 -0500 Subject: [PATCH 19/24] Right URL --- .github/workflows/self_hosted_integration_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/self_hosted_integration_tests.yml b/.github/workflows/self_hosted_integration_tests.yml index ca46fe2e520..a28469f5193 100644 --- a/.github/workflows/self_hosted_integration_tests.yml +++ b/.github/workflows/self_hosted_integration_tests.yml @@ -105,7 +105,7 @@ jobs: - name: Install skorch run: | source activate accelerate - git clone https://github.com/huggingface/skorch; + git clone https://github.com/skorch-dev/skorch; cd skorch; git config --global --add safe.directory '*' git checkout master && git pull From 1aeb1e8997bd393c0cc9943752b061e9a4bc67ef Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Tue, 21 Nov 2023 08:41:57 -0500 Subject: [PATCH 20/24] Don't make integration tests wait --- .github/workflows/nightly.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 3e7f77bf9fd..72450599515 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -107,6 +107,5 @@ jobs: run-integration-tests: - needs: [run_all_tests_single_gpu, run_all_tests_multi_gpu] if: always() uses: ./.github/workflows/self_hosted_integration_tests.yml \ No newline at end of file From 
d25efa71ce76a5f5911a1fc6c039979d7248596f Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Tue, 21 Nov 2023 09:54:33 -0500 Subject: [PATCH 21/24] Don't install comet --- .github/workflows/self_hosted_integration_tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/self_hosted_integration_tests.yml b/.github/workflows/self_hosted_integration_tests.yml index a28469f5193..42e26bfc1bd 100644 --- a/.github/workflows/self_hosted_integration_tests.yml +++ b/.github/workflows/self_hosted_integration_tests.yml @@ -49,6 +49,7 @@ jobs: cd accelerate; git checkout ${{ github.sha }} ; pip install -e .[testing]; + pip uninstall comet_ml wandb -y cd ..; - name: Show installed libraries From 244122c736141b164242084c659b6dafa4208fea Mon Sep 17 00:00:00 2001 From: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> Date: Fri, 24 Nov 2023 09:31:57 +0530 Subject: [PATCH 22/24] fsdp refactoring (#2177) * remove the redundant code post the torch 2.1 release * make `use_orig_params=True` by default. * fix `save_state` optimizer saving for fsdp and update the fsdp example * quality * fixing the utils and tests. Updating the docs * bump up the minimum version for FSDP support. * address comment * rename fsdp model checkpointing variables --- docs/source/usage_guides/fsdp.md | 82 ++++------------- .../by_feature/fsdp_with_peak_mem_tracking.py | 32 +++---- src/accelerate/accelerator.py | 92 ++----------------- src/accelerate/commands/config/cluster.py | 4 +- src/accelerate/commands/launch.py | 2 +- src/accelerate/utils/constants.py | 3 +- src/accelerate/utils/dataclasses.py | 2 +- src/accelerate/utils/fsdp_utils.py | 38 ++++---- tests/fsdp/test_fsdp.py | 5 + 9 files changed, 70 insertions(+), 190 deletions(-) diff --git a/docs/source/usage_guides/fsdp.md b/docs/source/usage_guides/fsdp.md index a57a4bf6801..96385a38178 100644 --- a/docs/source/usage_guides/fsdp.md +++ b/docs/source/usage_guides/fsdp.md @@ -40,23 +40,30 @@ For instance, here is how you would run the NLP example (from the root of the re ```bash compute_environment: LOCAL_MACHINE -deepspeed_config: {} +debug: false distributed_type: FSDP downcast_bf16: 'no' fsdp_config: fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP fsdp_backward_prefetch_policy: BACKWARD_PRE + fsdp_cpu_ram_efficient_loading: true + fsdp_forward_prefetch: false fsdp_offload_params: false fsdp_sharding_strategy: 1 - fsdp_state_dict_type: FULL_STATE_DICT + fsdp_state_dict_type: SHARDED_STATE_DICT + fsdp_sync_module_states: true fsdp_transformer_layer_cls_to_wrap: BertLayer + fsdp_use_orig_params: true machine_rank: 0 -main_process_ip: null -main_process_port: null main_training_function: main -mixed_precision: 'no' +mixed_precision: bf16 num_machines: 1 num_processes: 2 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false use_cpu: false ``` @@ -66,7 +73,7 @@ accelerate launch examples/nlp_example.py Currently, `Accelerate` supports the following config through the CLI: -```bash + `Sharding Strategy`: [1] FULL_SHARD (shards optimizer states, gradients and parameters), [2] SHARD_GRAD_OP (shards optimizer states and gradients), [3] NO_SHARD (DDP), [4] HYBRID_SHARD (shards optimizer states, gradients and parameters within each node while each node has full copy), [5] HYBRID_SHARD_ZERO2 (shards optimizer states and gradients within each node while each node has full copy) `Offload Params`: Decides Whether to offload parameters and gradients to CPU @@ -94,12 +101,12 @@ all-gather while executing in the forward 
pass. only use with Static graphs. `Use Orig Params`: If True, allows non-uniform `requires_grad` during init, which means support for interspersed frozen and trainable paramteres. Useful in cases such as parameter-efficient fine-tuning. -Please refer this [blog](https://dev-discuss.pytorch.org/t/rethinking-pytorch-fully-sharded-data-parallel-fsdp-from-first-principles/1019) +Please refer this [blog](https://dev-discuss.pytorch.org/t/rethinking-pytorch-fully-sharded-data-parallel-fsdp-from-first-principles/1019). This also enables to have different optimizer param groups. This should be `True` when creating optimizer object before preparing/wrapping the model with FSDP. `CPU RAM Efficient Model loading`: If True, only the first process loads the pretrained model checkoint while all other processes have empty weights. Only applicable for 🤗 Transformers models. This should be set to False if you experience errors when loading the pretrained 🤗 Transformers model via `from_pretrained` method. When using this, `Sync Module States` needs to be True else all the processes expect the main process would have random empty weights leading to unexpected behaviour during training. `Sync Module States`: If True, each individually wrapped FSDP unit will broadcast module parameters from rank 0 -``` + For additional and more nuanced control, you can specify other FSDP parameters via `FullyShardedDataParallelPlugin`. When creating `FullyShardedDataParallelPlugin` object, pass it the parameters that weren't part of the accelerate config or if you want to override them. @@ -156,72 +163,19 @@ When using transformers `save_pretrained`, pass `state_dict=accelerator.get_stat args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save, -+ state_dict=accelerator.get_state_dict(model, unwrap=False), ++ state_dict=accelerator.get_state_dict(model), ) ``` ### State Dict -`accelerator.get_state_dict` will call the underlying `model.state_dict` implementation. With a model wrapped by FSDP, the default behavior of `state_dict` is to gather all of the state in the rank 0 device. This can cause CUDA out of memory errors if the parameters don't fit on a single GPU. - -To avoid this, PyTorch provides a context manager that adjusts the behavior of `state_dict`. To offload some of the state dict onto CPU, you can use the following code: - -``` -from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, StateDictType, FullStateDictConfig - -full_state_dict_config = FullStateDictConfig(offload_to_cpu=True, rank0_only=True) -with FSDP.state_dict_type(unwrapped_model, StateDictType.FULL_STATE_DICT, full_state_dict_config): - state = accelerator.get_state_dict(unwrapped_model) -``` +`accelerator.get_state_dict` will call the underlying `model.state_dict` implementation using `FullStateDictConfig(offload_to_cpu=True, rank0_only=True)` context manager to get the state dict only for rank 0 and it will be offloaded to CPU. You can then pass `state` into the `save_pretrained` method. There are several modes for `StateDictType` and `FullStateDictConfig` that you can use to control the behavior of `state_dict`. For more information, see the [PyTorch documentation](https://pytorch.org/docs/stable/fsdp.html). ## A few caveats to be aware of -- PyTorch FSDP auto wraps sub-modules, flattens the parameters and shards the parameters in place. - Due to this, any optimizer created before model wrapping gets broken and occupies more memory. 
- Hence, it is highly recommended and efficient to prepare the model before creating the optimizer. - `Accelerate` will automatically wrap the model and create an optimizer for you in case of single model with a warning message. - > FSDP Warning: When using FSDP, it is efficient and recommended to call prepare for the model before creating the optimizer - -However, below is the recommended way to prepare model and optimizer while using FSDP: - -```diff - model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True) -+ model = accelerator.prepare(model) - - optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr) - -- model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( -- model, optimizer, train_dataloader, eval_dataloader, lr_scheduler -- ) - -+ optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( -+ optimizer, train_dataloader, eval_dataloader, lr_scheduler -+ ) -``` - -- In case of a single model, if you have created the optimizer with multiple parameter groups and called prepare with them together, - then the parameter groups will be lost and the following warning is displayed: - > FSDP Warning: When using FSDP, several parameter groups will be conflated into - > a single one due to nested module wrapping and parameter flattening. - - This is because parameter groups created before wrapping will have no meaning post wrapping due to parameter flattening of nested FSDP modules into 1D arrays (which can consume many layers). - For instance, below are the named parameters of an FSDP model on GPU 0 (When using 2 GPUs. Around 55M (110M/2) params in 1D arrays as this will have the 1st shard of the parameters). - Here, if one has applied no weight decay for [bias, LayerNorm.weight] the named parameters of an unwrapped BERT model, - it can't be applied to the below FSDP wrapped model as there are no named parameters with either of those strings and - the parameters of those layers are concatenated with parameters of various other layers. - ``` - { - '_fsdp_wrapped_module.flat_param': torch.Size([494209]), - '_fsdp_wrapped_module._fpw_module.bert.embeddings.word_embeddings._fsdp_wrapped_module.flat_param': torch.Size([11720448]), - '_fsdp_wrapped_module._fpw_module.bert.encoder._fsdp_wrapped_module.flat_param': torch.Size([42527232]) - } - ``` - - -- In case of multiple models, it is necessary to prepare the models before creating optimizers or else it will throw an error. -Then pass the optimizers to the prepare call in the same order as corresponding models else `accelerator.save_state()` and `accelerator.load_state()` will result in wrong/unexpected behaviour. +- In case of multiple models, pass the optimizers to the prepare call in the same order as corresponding models else `accelerator.save_state()` and `accelerator.load_state()` will result in wrong/unexpected behaviour. - This feature is incompatible with `--predict_with_generate` in the `run_translation.py` script of 🤗 `Transformers` library. For more control, users can leverage the `FullyShardedDataParallelPlugin`. After creating an instance of this class, users can pass it to the Accelerator class instantiation. 
diff --git a/examples/by_feature/fsdp_with_peak_mem_tracking.py b/examples/by_feature/fsdp_with_peak_mem_tracking.py index 8abe3278953..22c87ada540 100644 --- a/examples/by_feature/fsdp_with_peak_mem_tracking.py +++ b/examples/by_feature/fsdp_with_peak_mem_tracking.py @@ -247,16 +247,19 @@ def collate_fn(examples): args.model_name_or_path, return_dict=True, low_cpu_mem_usage=True ) - # New Code # - # For FSDP feature, it is highly recommended and efficient to prepare the model before creating optimizer - model = accelerator.prepare(model) - accelerator.print(model) - - # Instantiate optimizer - # New Code # - # For FSDP feature, at present it doesn't support multiple parameter groups, - # so we need to create a single parameter group for the whole model - optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr, weight_decay=2e-4) + no_decay = ["bias", "LayerNorm.weight"] + optimizer_grouped_parameters = [ + { + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": 0.003, + }, + { + "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], + "weight_decay": 0.0, + }, + ] + + optimizer = torch.optim.AdamW(params=optimizer_grouped_parameters, lr=lr, weight_decay=2e-4) # Instantiate scheduler lr_scheduler = get_linear_schedule_with_warmup( @@ -265,13 +268,8 @@ def collate_fn(examples): num_training_steps=(len(train_dataloader) * num_epochs) // gradient_accumulation_steps, ) - # New Code # - # For FSDP feature, prepare everything except the model as we have already prepared the model - # before creating the optimizer - # There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the - # prepare method. - optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( - optimizer, train_dataloader, eval_dataloader, lr_scheduler + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler ) overall_step = 0 diff --git a/src/accelerate/accelerator.py b/src/accelerate/accelerator.py index c464cbd6f28..55aef1a366b 100755 --- a/src/accelerate/accelerator.py +++ b/src/accelerate/accelerator.py @@ -1100,52 +1100,6 @@ def _prepare_one(self, obj, first_pass=False, device_placement=None): # Return the unprocessed object if previous criteria was not met return obj - def _prepare_fsdp(self, *args): - result = [] - for obj in args: - if isinstance(obj, torch.nn.Module): - model = obj - break - optimizers = [] - - self._schedulers = [] - self._models = [] - intermediate_result = [] - for obj in args: - if isinstance(obj, torch.optim.Optimizer): - if len(obj.param_groups) > 1: - logger.warning( - "FSDP Warning: When using FSDP, several parameter groups will be conflated into " - "a single one due to nested module wrapping and parameter flattening." 
- ) - try: - optimizer = obj.optimizer.__class__(model.parameters(), **obj.optimizer.defaults) - except TypeError: - if "differentiable" in obj.optimizer.defaults: - # https://github.com/huggingface/accelerate/issues/801 - defaults = {k: v for k, v in obj.optimizer.defaults.items() if k != "differentiable"} - optimizer = obj.optimizer.__class__(model.parameters(), **defaults) - else: - raise - obj = self.prepare_optimizer(optimizer) - optimizers.append(obj) - elif isinstance(obj, torch.nn.Module): - self._models.append(obj) - intermediate_result.append(obj) - - for obj in intermediate_result: - if isinstance(obj, AcceleratedScheduler): - obj.optimizer = optimizers - for i, opt in enumerate(self._optimizers): - if getattr(obj.scheduler, "optimizer", None) == opt.optimizer: - obj.scheduler.optimizer = optimizers[i] - obj.optimizers = [optimizers[i]] - break - self._schedulers.append(obj) - result.append(obj) - self._optimizers = optimizers - return tuple(result) - def prepare(self, *args, device_placement=None): """ Prepare all objects passed in `args` for distributed training and mixed precision, then return them in the same @@ -1214,35 +1168,6 @@ def prepare(self, *args, device_placement=None): " Please rerun your script specifying `--num_processes=1` or by launching with `python {{myscript.py}}`." ) - if self.distributed_type == DistributedType.FSDP: - from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP - - model_count = 0 - optimizer_present = False - is_type_fsdp = False - for obj in args: - if isinstance(obj, torch.nn.Module): - model_count += 1 - # if the model is compiled using PyTorch 2.0, - # check that the wrapped model is FSDP or not; - # else check if it is FSDP or not; - is_type_fsdp = isinstance(obj, FSDP) or ( - is_compiled_module(obj) and isinstance(obj._orig_mod, FSDP) - ) - if isinstance(obj, torch.optim.Optimizer): - optimizer_present = True - if model_count > 1 and optimizer_present: - raise ValueError( - "For FSDP to work with multiple models (>1), " - "prepare must be called for all the models before optimizers are created. " - "Then pass the optimizers to the prepare call in the same order as corresponding models." 
- ) - elif model_count == 1 and not is_type_fsdp and optimizer_present: - logger.warning( - "FSDP Warning: When using FSDP, " - "it is efficient and recommended to call prepare for the model before creating the optimizer" - ) - if self.distributed_type == DistributedType.DEEPSPEED: model_count = 0 for obj in args: @@ -1298,14 +1223,6 @@ def prepare(self, *args, device_placement=None): if isinstance(obj, torch.optim.Optimizer): obj._switch_parameters(mapping) - if ( - self.distributed_type == DistributedType.FSDP - and model_count == 1 - and not is_type_fsdp - and optimizer_present - ): - result = self._prepare_fsdp(*result) - for item in result: if any( item in container @@ -2753,7 +2670,7 @@ def _inner(folder): # Save the optimizers taking care of FSDP and DeepSpeed nuances optimizers = [] if self.distributed_type == DistributedType.FSDP: - for opt in self._optimizers: + for i, opt in enumerate(self._optimizers): logger.info("Saving FSDP Optimizer") save_fsdp_optimizer(self.state.fsdp_plugin, self, opt, self._models[i], output_dir, i) logger.info(f"FSDP Optimizer saved to output dir {output_dir}") @@ -3068,6 +2985,13 @@ def get_state_dict(self, model, unwrap=True): from deepspeed.checkpoint.utils import clone_tensors_for_torch_save state_dict = clone_tensors_for_torch_save(self.unwrap_model(model).state_dict()) + elif self.distributed_type == DistributedType.FSDP: + from torch.distributed.fsdp import FullStateDictConfig, StateDictType + from torch.distributed.fsdp import FullyShardedDataParallel as FSDP + + full_state_dict_config = FullStateDictConfig(offload_to_cpu=True, rank0_only=True) + with FSDP.state_dict_type(model, StateDictType.FULL_STATE_DICT, full_state_dict_config): + state_dict = model.state_dict() else: if unwrap: model = self.unwrap_model(model) diff --git a/src/accelerate/commands/config/cluster.py b/src/accelerate/commands/config/cluster.py index 1331e7fe43c..85d13d19cc5 100644 --- a/src/accelerate/commands/config/cluster.py +++ b/src/accelerate/commands/config/cluster.py @@ -381,9 +381,9 @@ def get_cluster_input(): error_message="Please enter yes or no.", ) fsdp_config["fsdp_use_orig_params"] = _ask_field( - "Do you want to enable FSDP's `use_orig_params` feature? [yes/NO]: ", + "Do you want to enable FSDP's `use_orig_params` feature? [YES/no]: ", _convert_yes_no_to_bool, - default=False, + default=True, error_message="Please enter yes or no.", ) fsdp_config["fsdp_cpu_ram_efficient_loading"] = _ask_field( diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py index 2dfc4fdb7ee..8e44919b23d 100644 --- a/src/accelerate/commands/launch.py +++ b/src/accelerate/commands/launch.py @@ -519,7 +519,7 @@ def launch_command_parser(subparsers=None): ) fsdp_args.add_argument( "--fsdp_use_orig_params", - default="false", + default="true", type=str, help="If True, allows non-uniform `requires_grad` during init, which means support for interspersed frozen and trainable paramteres." 
" (useful only when `use_fsdp` flag is passed).", diff --git a/src/accelerate/utils/constants.py b/src/accelerate/utils/constants.py index 843eb5756af..c17487ade01 100644 --- a/src/accelerate/utils/constants.py +++ b/src/accelerate/utils/constants.py @@ -34,7 +34,8 @@ FSDP_AUTO_WRAP_POLICY = ["TRANSFORMER_BASED_WRAP", "SIZE_BASED_WRAP", "NO_WRAP"] FSDP_BACKWARD_PREFETCH = ["BACKWARD_PRE", "BACKWARD_POST", "NO_PREFETCH"] FSDP_STATE_DICT_TYPE = ["FULL_STATE_DICT", "LOCAL_STATE_DICT", "SHARDED_STATE_DICT"] -FSDP_PYTORCH_VERSION = "2.0.1" +FSDP_PYTORCH_VERSION = "2.1.0" +FSDP_MODEL_NAME = "pytorch_model_fsdp" DEEPSPEED_MULTINODE_LAUNCHERS = ["pdsh", "standard", "openmpi", "mvapich", "mpich"] TORCH_DYNAMO_MODES = ["default", "reduce-overhead", "max-autotune"] diff --git a/src/accelerate/utils/dataclasses.py b/src/accelerate/utils/dataclasses.py index 2d22d460aee..6bc51c399e3 100644 --- a/src/accelerate/utils/dataclasses.py +++ b/src/accelerate/utils/dataclasses.py @@ -870,7 +870,7 @@ class FullyShardedDataParallelPlugin: }, ) use_orig_params: bool = field( - default=False, + default=True, metadata={ "help": "If True, allows non-uniform `requires_grad` during init, which means support for interspersed frozen and trainable paramteres. " "Useful in cases such as parameter-efficient fine-tuning. " diff --git a/src/accelerate/utils/fsdp_utils.py b/src/accelerate/utils/fsdp_utils.py index 827b9ffd99c..edff9dec604 100644 --- a/src/accelerate/utils/fsdp_utils.py +++ b/src/accelerate/utils/fsdp_utils.py @@ -16,7 +16,7 @@ import torch from ..logging import get_logger -from .constants import FSDP_PYTORCH_VERSION, MODEL_NAME, OPTIMIZER_NAME +from .constants import FSDP_MODEL_NAME, FSDP_PYTORCH_VERSION, OPTIMIZER_NAME from .imports import is_torch_distributed_available from .versions import is_torch_version @@ -47,7 +47,7 @@ def save_fsdp_model(fsdp_plugin, accelerator, model, output_dir, model_index=0): ): state_dict = model.state_dict() if fsdp_plugin.state_dict_type == StateDictType.FULL_STATE_DICT: - weights_name = f"{MODEL_NAME}.bin" if model_index == 0 else f"{MODEL_NAME}_{model_index}.bin" + weights_name = f"{FSDP_MODEL_NAME}.bin" if model_index == 0 else f"{FSDP_MODEL_NAME}_{model_index}.bin" output_model_file = os.path.join(output_dir, weights_name) if accelerator.process_index == 0: logger.info(f"Saving model to {output_model_file}") @@ -55,16 +55,16 @@ def save_fsdp_model(fsdp_plugin, accelerator, model, output_dir, model_index=0): logger.info(f"Model saved to {output_model_file}") elif fsdp_plugin.state_dict_type == StateDictType.LOCAL_STATE_DICT: weights_name = ( - f"{MODEL_NAME}_rank{accelerator.process_index}.bin" + f"{FSDP_MODEL_NAME}_rank{accelerator.process_index}.bin" if model_index == 0 - else f"{MODEL_NAME}_{model_index}_rank{accelerator.process_index}.bin" + else f"{FSDP_MODEL_NAME}_{model_index}_rank{accelerator.process_index}.bin" ) output_model_file = os.path.join(output_dir, weights_name) logger.info(f"Saving model to {output_model_file}") torch.save(state_dict, output_model_file) logger.info(f"Model saved to {output_model_file}") elif fsdp_plugin.state_dict_type == StateDictType.SHARDED_STATE_DICT: - ckpt_dir = os.path.join(output_dir, f"{MODEL_NAME}_{model_index}") + ckpt_dir = os.path.join(output_dir, f"{FSDP_MODEL_NAME}_{model_index}") os.makedirs(ckpt_dir, exist_ok=True) logger.info(f"Saving model to {ckpt_dir}") state_dict = {"model": state_dict} @@ -96,16 +96,16 @@ def load_fsdp_model(fsdp_plugin, accelerator, model, input_dir, model_index=0): "initializing FSDP object" ) 
return - weights_name = f"{MODEL_NAME}.bin" if model_index == 0 else f"{MODEL_NAME}_{model_index}.bin" + weights_name = f"{FSDP_MODEL_NAME}.bin" if model_index == 0 else f"{FSDP_MODEL_NAME}_{model_index}.bin" input_model_file = os.path.join(input_dir, weights_name) logger.info(f"Loading model from {input_model_file}") state_dict = torch.load(input_model_file) logger.info(f"Model loaded from {input_model_file}") elif fsdp_plugin.state_dict_type == StateDictType.LOCAL_STATE_DICT: weights_name = ( - f"{MODEL_NAME}_rank{accelerator.process_index}.bin" + f"{FSDP_MODEL_NAME}_rank{accelerator.process_index}.bin" if model_index == 0 - else f"{MODEL_NAME}_{model_index}_rank{accelerator.process_index}.bin" + else f"{FSDP_MODEL_NAME}_{model_index}_rank{accelerator.process_index}.bin" ) input_model_file = os.path.join(input_dir, weights_name) logger.info(f"Loading model from {input_model_file}") @@ -113,8 +113,8 @@ def load_fsdp_model(fsdp_plugin, accelerator, model, input_dir, model_index=0): logger.info(f"Model loaded from {input_model_file}") elif fsdp_plugin.state_dict_type == StateDictType.SHARDED_STATE_DICT: ckpt_dir = ( - os.path.join(input_dir, f"{MODEL_NAME}_{model_index}") - if f"{MODEL_NAME}" not in input_dir + os.path.join(input_dir, f"{FSDP_MODEL_NAME}_{model_index}") + if f"{FSDP_MODEL_NAME}" not in input_dir else input_dir ) logger.info(f"Loading model from {ckpt_dir}") @@ -164,16 +164,14 @@ def load_fsdp_optimizer(fsdp_plugin, accelerator, optimizer, model, input_dir, o ): if fsdp_plugin.state_dict_type == StateDictType.FULL_STATE_DICT: optim_state = None - # below check should work but currently it isn't working (mostly opytorch issue), - # in the meantime disabling it at the cost of excess memory usage - # if accelerator.process_index == 0 or not fsdp_plugin.optim_state_dict_config.rank0_only: - optimizer_name = ( - f"{OPTIMIZER_NAME}.bin" if optimizer_index == 0 else f"{OPTIMIZER_NAME}_{optimizer_index}.bin" - ) - input_optimizer_file = os.path.join(input_dir, optimizer_name) - logger.info(f"Loading Optimizer state from {input_optimizer_file}") - optim_state = torch.load(input_optimizer_file) - logger.info(f"Optimizer state loaded from {input_optimizer_file}") + if accelerator.process_index == 0 or not fsdp_plugin.optim_state_dict_config.rank0_only: + optimizer_name = ( + f"{OPTIMIZER_NAME}.bin" if optimizer_index == 0 else f"{OPTIMIZER_NAME}_{optimizer_index}.bin" + ) + input_optimizer_file = os.path.join(input_dir, optimizer_name) + logger.info(f"Loading Optimizer state from {input_optimizer_file}") + optim_state = torch.load(input_optimizer_file) + logger.info(f"Optimizer state loaded from {input_optimizer_file}") else: ckpt_dir = ( os.path.join(input_dir, f"{OPTIMIZER_NAME}_{optimizer_index}") diff --git a/tests/fsdp/test_fsdp.py b/tests/fsdp/test_fsdp.py index 7b87f61f471..244bedf4d82 100644 --- a/tests/fsdp/test_fsdp.py +++ b/tests/fsdp/test_fsdp.py @@ -252,6 +252,11 @@ def test_checkpointing(self): continue state_dict_config_index = len(cmd_config) for state_dict_type in FSDP_STATE_DICT_TYPE: + # Todo: Currently failing for `LOCAL_STATE_DICT` with error + # Unexpected key(s) in state_dict: "_fsdp_wrapped_module._flat_param". 
+ if state_dict_type == "LOCAL_STATE_DICT": + continue + cmd_config = cmd_config[:state_dict_config_index] cmd_config.append(f"--fsdp_state_dict_type={state_dict_type}") cmd_config.extend( From 5fc1b230d339c6e77179adfe2b74a6b414c9cbbf Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Tue, 28 Nov 2023 13:34:11 -0500 Subject: [PATCH 23/24] Pin DVC (#2196) * Remove dvc * Pin instead --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b3a8fda47bf..f7369d7df97 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ extras["testing"] = extras["test_prod"] + extras["test_dev"] extras["rich"] = ["rich"] -extras["test_trackers"] = ["wandb", "comet-ml", "tensorboard", "dvclive"] +extras["test_trackers"] = ["wandb", "comet-ml", "tensorboard", "dvclive", "dvc<=3.30.1"] extras["dev"] = extras["quality"] + extras["testing"] + extras["rich"] extras["sagemaker"] = [ From b04d36c75f701266048382426b4074e28bfdb67c Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Tue, 28 Nov 2023 15:02:20 -0500 Subject: [PATCH 24/24] Apply DVC warning to Accelerate (#2197) * Use logger warn instead * Warn * Right import * Clean up logs * Apply suggestions from code review Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> --------- Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> --- src/accelerate/logging.py | 12 ++++++++++++ src/accelerate/tracking.py | 20 +++++++++++++++----- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/src/accelerate/logging.py b/src/accelerate/logging.py index d553b9a993c..ebb8c1eb830 100644 --- a/src/accelerate/logging.py +++ b/src/accelerate/logging.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import functools import logging import os @@ -67,6 +68,17 @@ def log(self, level, msg, *args, **kwargs): self.logger.log(level, msg, *args, **kwargs) state.wait_for_everyone() + @functools.lru_cache(None) + def warning_once(self, *args, **kwargs): + """ + This method is identical to `logger.warning()`, but will emit the warning with the same message only once + + Note: The cache is for the function arguments, so 2 different callers using the same arguments will hit the + cache. The assumption here is that all warning messages are unique across the code. If they aren't then need to + switch to another type of cache that includes the caller frame information in the hashing function. + """ + self.warning(*args, **kwargs) + def get_logger(name: str, log_level: str = None): """ diff --git a/src/accelerate/tracking.py b/src/accelerate/tracking.py index 711f616b73e..7276f552aaf 100644 --- a/src/accelerate/tracking.py +++ b/src/accelerate/tracking.py @@ -640,8 +640,8 @@ def store_init_configuration(self, values: dict): for name, value in list(values.items()): # internally, all values are converted to str in MLflow if len(str(value)) > mlflow.utils.validation.MAX_PARAM_VAL_LENGTH: - logger.warning( - f'Trainer is attempting to log a value of "{value}" for key "{name}" as a parameter. MLflow\'s' + logger.warning_once( + f'Accelerate is attempting to log a value of "{value}" for key "{name}" as a parameter. MLflow\'s' f" log_param() only accepts values no longer than {mlflow.utils.validation.MAX_PARAM_VAL_LENGTH} characters so we dropped this attribute." 
) del values[name] @@ -670,7 +670,7 @@ def log(self, values: dict, step: Optional[int]): if isinstance(v, (int, float)): metrics[k] = v else: - logger.warning( + logger.warning_once( f'MLflowTracker is attempting to log a value of "{v}" of type {type(v)} for key "{k}" as a metric. ' "MLflow's log_metric() only accepts float and int types so we dropped this attribute." ) @@ -755,7 +755,7 @@ def log(self, values: Dict[str, Union[int, float]], step: Optional[int] = None, clearml_logger = self.task.get_logger() for k, v in values.items(): if not isinstance(v, (int, float)): - logger.warning( + logger.warning_once( "Accelerator is attempting to log a value of " f'"{v}" of type {type(v)} for key "{k}" as a scalar. ' "This invocation of ClearML logger's report_scalar() " @@ -901,10 +901,20 @@ def log(self, values: dict, step: Optional[int] = None, **kwargs): kwargs: Additional key word arguments passed along to `dvclive.Live.log_metric()`. """ + from dvclive.plots import Metric + if step is not None: self.live.step = step for k, v in values.items(): - self.live.log_metric(k, v, **kwargs) + if Metric.could_log(v): + self.live.log_metric(k, v, **kwargs) + else: + logger.warning_once( + "Accelerator attempted to log a value of " + f'"{v}" of type {type(v)} for key "{k}" as a scalar. ' + "This invocation of DVCLive's Live.log_metric() " + "is incorrect so we dropped this attribute." + ) @on_main_process def finish(self):