From b55855a3d4057d1d70a299d5b0b3630ea79e63e4 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 14 Nov 2023 15:44:30 +0100 Subject: [PATCH 01/24] fix initial typos (#2150) --- docs/source/concept_guides/big_model_inference.md | 2 +- docs/source/concept_guides/gradient_synchronization.md | 4 ++-- docs/source/usage_guides/distributed_inference.md | 2 +- docs/source/usage_guides/explore.md | 2 +- docs/source/usage_guides/megatron_lm.md | 8 ++++---- docs/source/usage_guides/training_zoo.md | 2 +- src/accelerate/commands/config/cluster.py | 2 +- 7 files changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/source/concept_guides/big_model_inference.md b/docs/source/concept_guides/big_model_inference.md index ddce9114cdc..4e09adae686 100644 --- a/docs/source/concept_guides/big_model_inference.md +++ b/docs/source/concept_guides/big_model_inference.md @@ -154,7 +154,7 @@ By passing `device_map="auto"`, we tell 🤗 Accelerate to determine automatical #### `no_split_module_classes` This parameter will indicate that some of the modules with the name `"Block"` should not be split across different devices. You should set here all blocks that -include a residutal connection of some kind. +include a residual connection of some kind. #### The `device_map` diff --git a/docs/source/concept_guides/gradient_synchronization.md b/docs/source/concept_guides/gradient_synchronization.md index 9010628ef7f..7ae8ab6853f 100644 --- a/docs/source/concept_guides/gradient_synchronization.md +++ b/docs/source/concept_guides/gradient_synchronization.md @@ -55,8 +55,8 @@ their gradients computed, collated, and updated before moving on to the next batch of data. When performing gradient accumulation, you accumulate `n` loss gradients and skip `optimizer.step()` until `n` batches have been reached. As all training -processes only need to sychronize by the time `optimizer.step()` is called, -without any modification to your training step, this neededless inter-process +processes only need to synchronize by the time `optimizer.step()` is called, +without any modification to your training step, this needless inter-process communication can cause a significant slowdown. How can you avoid this overhead? diff --git a/docs/source/usage_guides/distributed_inference.md b/docs/source/usage_guides/distributed_inference.md index 3bdd7121401..41053658482 100644 --- a/docs/source/usage_guides/distributed_inference.md +++ b/docs/source/usage_guides/distributed_inference.md @@ -51,7 +51,7 @@ def run_inference(rank, world_size): One will notice how we have to check the rank to know what prompt to send, which can be a bit tedious. A user might then also think that with 🤗 Accelerate, using the `Accelerator` to prepare a dataloader for such a task might also be -a simple way to manage this. (To learn more, check out the relvent section in the [Quick Tour](../quicktour#distributed-evaluation)) +a simple way to manage this. (To learn more, check out the relevant section in the [Quick Tour](../quicktour#distributed-evaluation)) Can it manage it? Yes. Does it add unneeded extra code however: also yes. diff --git a/docs/source/usage_guides/explore.md b/docs/source/usage_guides/explore.md index 2b4decefa2a..533c4cf444f 100644 --- a/docs/source/usage_guides/explore.md +++ b/docs/source/usage_guides/explore.md @@ -16,7 +16,7 @@ rendered properly in your Markdown viewer. # Learning how to incorporate 🤗 Accelerate features quickly! 
Please use the interactive tool below to help you get started with learning about a particular -feature of 🤗 Accelerate and how to utilize it! It will provide you with a code diff, an explaination +feature of 🤗 Accelerate and how to utilize it! It will provide you with a code diff, an explanation towards what is going on, as well as provide you with some useful links to explore more within the documentation! diff --git a/docs/source/usage_guides/megatron_lm.md b/docs/source/usage_guides/megatron_lm.md index 7b6822086da..25bea1f58d2 100644 --- a/docs/source/usage_guides/megatron_lm.md +++ b/docs/source/usage_guides/megatron_lm.md @@ -128,7 +128,7 @@ Do you want to enable Sequence Parallelism? [YES/no]: What is the Pipeline Parallelism degree/size? [1]:2 What is the number of micro-batches? [1]:2 Do you want to enable selective activation recomputation? [YES/no]: -Do you want to use distributed optimizer which shards optimizer state and gradients across data pralellel ranks? [YES/no]: +Do you want to use distributed optimizer which shards optimizer state and gradients across data parallel ranks? [YES/no]: What is the gradient clipping value based on global L2 Norm (0 to disable)? [1.0]: How many GPU(s) should be used for distributed training? [1]:4 Do you wish to use FP16 or BF16 (mixed precision)? [NO/fp16/bf16]: bf16 @@ -355,8 +355,8 @@ def main(): 2. For using the Megatron-LM datasets, a few more changes are required. Dataloaders for these datasets are available only on rank 0 of each tensor parallel group. As such, there are rank where dataloader won't be -avaiable and this requires tweaks to the training loop. Being able to do all this shows how -felixble and extensible 🤗 Accelerate is. The changes required are as follows. +available and this requires tweaks to the training loop. Being able to do all this shows how +flexible and extensible 🤗 Accelerate is. The changes required are as follows. a. For Megatron-LM indexed datasets, we need to use `MegatronLMDummyDataLoader` and pass the required dataset args to it such as `data_path`, `seq_length` etc. @@ -547,7 +547,7 @@ The `model(**batch_data)` call return loss(es) averaged across the data parallel This is fine for most cases wherein pre-training jobs are run using Megatron-LM features and you can easily compute the `perplexity` using the loss. For GPT model, returning logits in addition to loss(es) is supported. -These logits aren't gathered across data prallel ranks. Use `accelerator.utils.gather_across_data_parallel_groups` +These logits aren't gathered across data parallel ranks. Use `accelerator.utils.gather_across_data_parallel_groups` to gather logits across data parallel ranks. These logits along with labels can be used for computing various performance metrics. diff --git a/docs/source/usage_guides/training_zoo.md b/docs/source/usage_guides/training_zoo.md index 42dfe18a9f3..2a7f51d2873 100644 --- a/docs/source/usage_guides/training_zoo.md +++ b/docs/source/usage_guides/training_zoo.md @@ -15,7 +15,7 @@ rendered properly in your Markdown viewer. 
# Example Zoo -Below contains a non-exhuastive list of tutorials and scripts showcasing 🤗 Accelerate +Below contains a non-exhaustive list of tutorials and scripts showcasing 🤗 Accelerate ## Official Accelerate Examples: diff --git a/src/accelerate/commands/config/cluster.py b/src/accelerate/commands/config/cluster.py index 1090d17ddc3..1331e7fe43c 100644 --- a/src/accelerate/commands/config/cluster.py +++ b/src/accelerate/commands/config/cluster.py @@ -451,7 +451,7 @@ def get_cluster_input(): megatron_lm_config[prefix + "use_distributed_optimizer"] = _ask_field( "Do you want to use distributed optimizer " - "which shards optimizer state and gradients across data pralellel ranks? [YES/no]: ", + "which shards optimizer state and gradients across data parallel ranks? [YES/no]: ", _convert_yes_no_to_bool, default=True, error_message="Please enter yes or no.", From 8dedb140ef8995b4ff6f4b0e2452369a0ab1a969 Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Tue, 14 Nov 2023 11:53:57 -0500 Subject: [PATCH 02/24] Add note about GradientState being in-sync with the dataloader by default (#2134) * NOte about sync * PR review comments --- .../usage_guides/gradient_accumulation.md | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/docs/source/usage_guides/gradient_accumulation.md b/docs/source/usage_guides/gradient_accumulation.md index 54863015d8b..7960e6b0e4c 100644 --- a/docs/source/usage_guides/gradient_accumulation.md +++ b/docs/source/usage_guides/gradient_accumulation.md @@ -118,8 +118,24 @@ You can remove all the special checks for the step number and the loss adjustmen As you can see the [`Accelerator`] is able to keep track of the batch number you are on and it will automatically know whether to step through the prepared optimizer and how to adjust the loss. + Typically with gradient accumulation, you would need to adjust the number of steps to reflect the change in total batches you are -training on. 🤗 Accelerate automagically does this for you by default. Behind the scenes we instantiate a GradientAccumulationPlugin configured to do this. +training on. 🤗 Accelerate automagically does this for you by default. Behind the scenes we instantiate a [`GradientAccumulationPlugin`] configured to do this. + + + + + +The [`state.GradientState`] is sync'd with the active dataloader being iterated upon. As such it assumes naively that when we have reached the end of the dataloader everything will sync and a step will be performed. 
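In practice this means the final, possibly incomplete accumulation window still triggers an optimizer step. A minimal sketch of that default behavior is below — the batch counts are illustrative, and `model`, `optimizer`, `dataloader`, and `loss_function` are assumed to be defined as earlier in this guide:

```python
from accelerate import Accelerator

# With e.g. 10 batches and gradient_accumulation_steps=4, steps happen after
# batches 4 and 8, and a final step also happens after batch 10 because the
# GradientState sees the end of the dataloader and forces a sync by default.
accelerator = Accelerator(gradient_accumulation_steps=4)
model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)

for batch in dataloader:
    with accelerator.accumulate(model):
        inputs, targets = batch
        loss = loss_function(model(inputs), targets)
        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()
```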
To disable this, set `sync_with_dataloader` to be `False` in the [`GradientAccumulationPlugin`]: + +```{python} +from accelerate import Accelerator +from accelerate.utils import GradientAccumulationPlugin + +plugin = GradientAccumulationPlugin(sync_with_dataloader=False) +accelerator = Accelerator(..., gradient_accumulation_plugin=plugin) +``` + ## The finished code From e9fd72a61365d8a5c41e3b7260ea4ec2d0a23053 Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Tue, 14 Nov 2023 14:42:01 -0500 Subject: [PATCH 03/24] Deprecated stuff (#2152) --- .github/workflows/build-docker-images-release.yml | 2 +- .github/workflows/quality.yml | 2 +- .github/workflows/stale.yml | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build-docker-images-release.yml b/.github/workflows/build-docker-images-release.yml index 2d3a8a6fbfb..8d5f2194b78 100644 --- a/.github/workflows/build-docker-images-release.yml +++ b/.github/workflows/build-docker-images-release.yml @@ -15,7 +15,7 @@ jobs: outputs: version: ${{ steps.step1.outputs.version }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v3.1.0 - id: step1 run: echo "version=$(python setup.py --version)" >> $GITHUB_OUTPUT diff --git a/.github/workflows/quality.yml b/.github/workflows/quality.yml index 0060c7635f7..ac1a463b0bd 100644 --- a/.github/workflows/quality.yml +++ b/.github/workflows/quality.yml @@ -6,7 +6,7 @@ jobs: quality: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3.1.0 - name: Set up Python 3.8 uses: actions/setup-python@v3 with: diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index f79ecf7a3bd..39ca1384783 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -13,10 +13,10 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3.1.0 - name: Setup Python - uses: actions/setup-python@v1 + uses: actions/setup-python@v3 with: python-version: 3.8 From a912b2ee095910c5df58dd426ac612ced1e5b173 Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Tue, 14 Nov 2023 15:03:41 -0500 Subject: [PATCH 04/24] Add examples to tests (#2131) * Add examples to tests * Try now * Right name * Right path * Fin * Too slow, just test on runner --- .github/workflows/self_hosted_integration_tests.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/self_hosted_integration_tests.yml b/.github/workflows/self_hosted_integration_tests.yml index 7b4d8f6b813..dc8b49cd38f 100644 --- a/.github/workflows/self_hosted_integration_tests.yml +++ b/.github/workflows/self_hosted_integration_tests.yml @@ -81,6 +81,16 @@ jobs: source activate accelerate; pytest -sv tests/deepspeed + - name: Run transformers examples tests + working-directory: transformers/examples/pytorch + env: + CUDA_VISIBLE_DEVICES: ${{ matrix.cuda_visible_devices }} + WANDB_DISABLED: true + run: | + pip install -r _tests_requirements.txt + cd ../../ + pytest -sv examples/pytorch/test_accelerate_examples.py examples/pytorch/test_pytorch_examples.py + run-skorch-tests: container: image: huggingface/accelerate-gpu:latest From 0f2686c8d3e6d949c4b7efa15d7f2dee44f7ce91 Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Wed, 15 Nov 2023 11:29:39 -0500 Subject: [PATCH 05/24] Disable pypi for merge workflows + fix trainer tests (#2153) * Disable workflows for PR + merge * skorch * Fix transformers tests too --- .github/workflows/integration_tests.yml | 8 ------- 
.../self_hosted_integration_tests.yml | 24 ++++--------------- 2 files changed, 5 insertions(+), 27 deletions(-) diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index e7bff0d60ca..68085ff9b21 100644 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -25,11 +25,6 @@ jobs: runs-on: ubuntu-latest strategy: fail-fast: false - matrix: - transformers-version: [ - pypi, - github - ] steps: - uses: actions/checkout@v3.1.0 - name: Set up python 3.8 @@ -47,9 +42,6 @@ jobs: cd .. git clone https://github.com/huggingface/transformers cd transformers - if [[ ${{ matrix.transformers-version }} = pypi ]]; then - git checkout $(git describe --tags `git rev-list --tags --max-count=1`) - fi pip install .[torch,testing] - name: Show installed libraries diff --git a/.github/workflows/self_hosted_integration_tests.yml b/.github/workflows/self_hosted_integration_tests.yml index dc8b49cd38f..94e50e61ff3 100644 --- a/.github/workflows/self_hosted_integration_tests.yml +++ b/.github/workflows/self_hosted_integration_tests.yml @@ -29,10 +29,6 @@ jobs: strategy: fail-fast: false matrix: - transformers-version: [ - pypi, - github - ] cuda_visible_devices: [ "0", "0,1" @@ -51,11 +47,9 @@ jobs: run: | source activate accelerate git config --global --add safe.directory '*' - git checkout main && git pull && git fetch --tags - if [[ ${{ matrix.transformers-version }} = pypi ]]; then - git checkout $(git tag --sort=taggerdate | tail -1) - fi + git checkout main && git pull pip install .[torch,deepspeed-testing] + pip uninstall comet_ml wandb -y - name: Show installed libraries run: | @@ -82,13 +76,13 @@ jobs: pytest -sv tests/deepspeed - name: Run transformers examples tests - working-directory: transformers/examples/pytorch + working-directory: transformers/ env: CUDA_VISIBLE_DEVICES: ${{ matrix.cuda_visible_devices }} WANDB_DISABLED: true run: | - pip install -r _tests_requirements.txt - cd ../../ + source activate accelerate + pip install -r examples/pytorch/_tests_requirements.txt pytest -sv examples/pytorch/test_accelerate_examples.py examples/pytorch/test_pytorch_examples.py run-skorch-tests: @@ -98,11 +92,6 @@ jobs: runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] strategy: fail-fast: false - matrix: - skorch-version: [ - pypi, - github - ] steps: - name: Update accelerate clone and pip install working-directory: accelerate/ @@ -118,9 +107,6 @@ jobs: source activate accelerate git config --global --add safe.directory '*' git checkout master && git pull - if [[ ${{ matrix.skorch-version }} = pypi ]]; then - git checkout $(git describe --tags `git rev-list --tags --max-count=1`) - fi pip install .[testing] pip install flaky From 99877f56d6d77f38f031d0bfac40c0d2a409f5b8 Mon Sep 17 00:00:00 2001 From: Dave Berenbaum Date: Fri, 17 Nov 2023 08:49:13 -0500 Subject: [PATCH 06/24] Adds dvclive tracker (#2139) * dvclive tracker * add dvclive to test_trackers * fix dvclive tests * add dvclive example and respond to other feedback * fix dvclive tests * fix quality --- docs/source/usage_guides/tracking.md | 3 +- .../deepspeed_with_config_support.py | 2 +- .../by_feature/megatron_lm_gpt_pretraining.py | 2 +- setup.py | 2 +- src/accelerate/test_utils/testing.py | 8 ++ src/accelerate/tracking.py | 79 +++++++++++++++++++ src/accelerate/utils/__init__.py | 1 + src/accelerate/utils/dataclasses.py | 2 + src/accelerate/utils/imports.py | 4 + tests/test_examples.py | 2 +- tests/test_tracking.py | 51 +++++++++++- 11 files changed, 150 
insertions(+), 6 deletions(-) diff --git a/docs/source/usage_guides/tracking.md b/docs/source/usage_guides/tracking.md index 141fea6924b..dba4b084d5d 100644 --- a/docs/source/usage_guides/tracking.md +++ b/docs/source/usage_guides/tracking.md @@ -20,7 +20,7 @@ There are a large number of experiment tracking API's available, however getting ## Integrated Trackers -Currently `Accelerate` supports six trackers out-of-the-box: +Currently `Accelerate` supports seven trackers out-of-the-box: - TensorBoard - WandB @@ -28,6 +28,7 @@ Currently `Accelerate` supports six trackers out-of-the-box: - Aim - MLFlow - ClearML +- DVCLive To use any of them, pass in the selected type(s) to the `log_with` parameter in [`Accelerate`]: ```python diff --git a/examples/by_feature/deepspeed_with_config_support.py b/examples/by_feature/deepspeed_with_config_support.py index 15e810c4a2e..b5f122f3ad1 100755 --- a/examples/by_feature/deepspeed_with_config_support.py +++ b/examples/by_feature/deepspeed_with_config_support.py @@ -220,7 +220,7 @@ def parse_args(): default="all", help=( 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,' - ' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.' + ' `"wandb"`, `"comet_ml"`, and `"dvclive"`. Use `"all"` (default) to report to all integrations.' "Only applicable when `--with_tracking` is passed." ), ) diff --git a/examples/by_feature/megatron_lm_gpt_pretraining.py b/examples/by_feature/megatron_lm_gpt_pretraining.py index 3c048b2600e..b0e1b33700f 100644 --- a/examples/by_feature/megatron_lm_gpt_pretraining.py +++ b/examples/by_feature/megatron_lm_gpt_pretraining.py @@ -216,7 +216,7 @@ def parse_args(): default="all", help=( 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,' - ' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.' + ' `"wandb"`, `"comet_ml"`, and `"dvclive"`. Use `"all"` (default) to report to all integrations.' "Only applicable when `--with_tracking` is passed." ), ) diff --git a/setup.py b/setup.py index f6eefda0dea..b3a8fda47bf 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ extras["testing"] = extras["test_prod"] + extras["test_dev"] extras["rich"] = ["rich"] -extras["test_trackers"] = ["wandb", "comet-ml", "tensorboard"] +extras["test_trackers"] = ["wandb", "comet-ml", "tensorboard", "dvclive"] extras["dev"] = extras["quality"] + extras["testing"] + extras["rich"] extras["sagemaker"] = [ diff --git a/src/accelerate/test_utils/testing.py b/src/accelerate/test_utils/testing.py index d6d1e2f2f0a..8a8b82f4e34 100644 --- a/src/accelerate/test_utils/testing.py +++ b/src/accelerate/test_utils/testing.py @@ -35,6 +35,7 @@ is_comet_ml_available, is_datasets_available, is_deepspeed_available, + is_dvclive_available, is_mps_available, is_pandas_available, is_tensorboard_available, @@ -231,6 +232,13 @@ def require_clearml(test_case): return unittest.skipUnless(is_clearml_available(), "test requires clearml")(test_case) +def require_dvclive(test_case): + """ + Decorator marking a test that requires dvclive installed. These tests are skipped when dvclive isn't installed + """ + return unittest.skipUnless(is_dvclive_available(), "test requires dvclive")(test_case) + + def require_pandas(test_case): """ Decorator marking a test that requires pandas installed. 
These tests are skipped when pandas isn't installed diff --git a/src/accelerate/tracking.py b/src/accelerate/tracking.py index 4f536d57812..711f616b73e 100644 --- a/src/accelerate/tracking.py +++ b/src/accelerate/tracking.py @@ -30,6 +30,7 @@ is_aim_available, is_clearml_available, is_comet_ml_available, + is_dvclive_available, is_mlflow_available, is_tensorboard_available, is_wandb_available, @@ -57,6 +58,9 @@ if is_clearml_available(): _available_trackers.append(LoggerType.CLEARML) +if is_dvclive_available(): + _available_trackers.append(LoggerType.DVCLIVE) + logger = get_logger(__name__) @@ -837,6 +841,79 @@ def _get_title_series(name): return name, "train" +class DVCLiveTracker(GeneralTracker): + """ + A `Tracker` class that supports `dvclive`. Should be initialized at the start of your script. + + Args: + run_name (`str`, *optional*): + Ignored for dvclive. See `kwargs` instead. + kwargs: + Additional key word arguments passed along to [`dvclive.Live()`](https://dvc.org/doc/dvclive/live). + + Example: + + ```py + from accelerate import Accelerator + + accelerator = Accelerator(log_with="dvclive") + accelerator.init_trackers(project_name="my_project", init_kwargs={"dvclive": {"dir": "my_directory"}}) + ``` + """ + + name = "dvclive" + requires_logging_directory = False + + @on_main_process + def __init__(self, run_name: Optional[str] = None, live: Optional[Any] = None, **kwargs): + from dvclive import Live + + super().__init__() + self.live = live if live is not None else Live(**kwargs) + + @property + def tracker(self): + return self.live + + @on_main_process + def store_init_configuration(self, values: dict): + """ + Logs `values` as hyperparameters for the run. Should be run at the beginning of your experiment. Stores the + hyperparameters in a yaml file for future use. + + Args: + values (Dictionary `str` to `bool`, `str`, `float`, `int`, or a List or Dict of those types): + Values to be stored as initial hyperparameters as key-value pairs. The values need to have type `bool`, + `str`, `float`, or `int`. + """ + self.live.log_params(values) + + @on_main_process + def log(self, values: dict, step: Optional[int] = None, **kwargs): + """ + Logs `values` to the current run. + + Args: + values (Dictionary `str` to `str`, `float`, or `int`): + Values to be logged as key-value pairs. The values need to have type `str`, `float`, or `int`. + step (`int`, *optional*): + The run step. If included, the log will be affiliated with this step. + kwargs: + Additional key word arguments passed along to `dvclive.Live.log_metric()`. + """ + if step is not None: + self.live.step = step + for k, v in values.items(): + self.live.log_metric(k, v, **kwargs) + + @on_main_process + def finish(self): + """ + Closes `dvclive.Live()`. + """ + self.live.end() + + LOGGER_TYPE_TO_CLASS = { "aim": AimTracker, "comet_ml": CometMLTracker, @@ -844,6 +921,7 @@ def _get_title_series(name): "tensorboard": TensorBoardTracker, "wandb": WandBTracker, "clearml": ClearMLTracker, + "dvclive": DVCLiveTracker, } @@ -866,6 +944,7 @@ def filter_trackers( - `"wandb"` - `"comet_ml"` - `"mlflow"` + - `"dvclive"` If `"all"` is selected, will pick up all available trackers in the environment and initialize them. Can also accept implementations of `GeneralTracker` for custom trackers, and can be combined with `"all"`. 
logging_dir (`str`, `os.PathLike`, *optional*): diff --git a/src/accelerate/utils/__init__.py b/src/accelerate/utils/__init__.py index 88cc927f001..fa15b173ed3 100644 --- a/src/accelerate/utils/__init__.py +++ b/src/accelerate/utils/__init__.py @@ -52,6 +52,7 @@ is_cuda_available, is_datasets_available, is_deepspeed_available, + is_dvclive_available, is_fp8_available, is_ipex_available, is_megatron_lm_available, diff --git a/src/accelerate/utils/dataclasses.py b/src/accelerate/utils/dataclasses.py index 72f3c9aeb2d..e0e41568b0c 100644 --- a/src/accelerate/utils/dataclasses.py +++ b/src/accelerate/utils/dataclasses.py @@ -340,6 +340,7 @@ class LoggerType(BaseEnum): - **TENSORBOARD** -- TensorBoard as an experiment tracker - **WANDB** -- wandb as an experiment tracker - **COMETML** -- comet_ml as an experiment tracker + - **DVCLIVE** -- dvclive as an experiment tracker """ ALL = "all" @@ -349,6 +350,7 @@ class LoggerType(BaseEnum): COMETML = "comet_ml" MLFLOW = "mlflow" CLEARML = "clearml" + DVCLIVE = "dvclive" class PrecisionType(BaseEnum): diff --git a/src/accelerate/utils/imports.py b/src/accelerate/utils/imports.py index 9a60233c96c..27389eab107 100644 --- a/src/accelerate/utils/imports.py +++ b/src/accelerate/utils/imports.py @@ -297,3 +297,7 @@ def is_xpu_available(check_device=False): except RuntimeError: return False return hasattr(torch, "xpu") and torch.xpu.is_available() + + +def is_dvclive_available(): + return _is_package_available("dvclive") diff --git a/tests/test_examples.py b/tests/test_examples.py index 0426fc645b6..4b697e12132 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -205,7 +205,7 @@ def test_multi_process_metrics(self): run_command(self._launch_args + testargs) @require_trackers - @mock.patch.dict(os.environ, {"WANDB_MODE": "offline"}) + @mock.patch.dict(os.environ, {"WANDB_MODE": "offline", "DVCLIVE_TEST": "true"}) def test_tracking(self): with tempfile.TemporaryDirectory() as tmpdir: testargs = f""" diff --git a/tests/test_tracking.py b/tests/test_tracking.py index 545b51fefd4..58709546ea5 100644 --- a/tests/test_tracking.py +++ b/tests/test_tracking.py @@ -35,13 +35,19 @@ TempDirTestCase, require_clearml, require_comet_ml, + require_dvclive, require_pandas, require_tensorboard, require_wandb, skip, ) from accelerate.tracking import CometMLTracker, GeneralTracker -from accelerate.utils import ProjectConfiguration, is_comet_ml_available, is_tensorboard_available +from accelerate.utils import ( + ProjectConfiguration, + is_comet_ml_available, + is_dvclive_available, + is_tensorboard_available, +) if is_comet_ml_available(): @@ -52,6 +58,11 @@ import tensorboard.compat.proto.event_pb2 as event_pb2 +if is_dvclive_available(): + from dvclive.plots.metric import Metric + from dvclive.serialize import load_yaml + from dvclive.utils import parse_metrics + logger = logging.getLogger(__name__) @@ -473,3 +484,41 @@ def test_log(self): "some_string": "", } self.assertDictEqual(data, truth) + + +@require_dvclive +@mock.patch("dvclive.live.get_dvc_repo", return_value=None) +class DVCLiveTrackingTest(unittest.TestCase): + def test_init_trackers(self, mock_repo): + project_name = "test_project_with_config" + with tempfile.TemporaryDirectory() as dirpath: + accelerator = Accelerator(log_with="dvclive") + config = { + "num_iterations": 12, + "learning_rate": 1e-2, + "some_boolean": False, + "some_string": "some_value", + } + init_kwargs = {"dvclive": {"dir": dirpath, "save_dvc_exp": False, "dvcyaml": None}} + accelerator.init_trackers(project_name, config, 
init_kwargs) + accelerator.end_training() + live = accelerator.trackers[0].live + params = load_yaml(live.params_file) + assert params == config + + def test_log(self, mock_repo): + project_name = "test_project_with_log" + with tempfile.TemporaryDirectory() as dirpath: + accelerator = Accelerator(log_with="dvclive", project_dir=dirpath) + init_kwargs = {"dvclive": {"dir": dirpath, "save_dvc_exp": False, "dvcyaml": None}} + accelerator.init_trackers(project_name, init_kwargs=init_kwargs) + values = {"total_loss": 0.1, "iteration": 1, "my_text": "some_value"} + accelerator.log(values, step=0) + accelerator.end_training() + live = accelerator.trackers[0].live + logs, latest = parse_metrics(live) + assert latest == values + scalars = os.path.join(live.plots_dir, Metric.subfolder) + assert os.path.join(scalars, "total_loss.tsv") in logs + assert os.path.join(scalars, "iteration.tsv") in logs + assert os.path.join(scalars, "my_text.tsv") in logs From cf745c936d541c538f2d1dfda12b2d5c0a2715f8 Mon Sep 17 00:00:00 2001 From: Jingru Date: Fri, 17 Nov 2023 22:00:55 +0800 Subject: [PATCH 07/24] check port availability only in main deepspeed/torchrun launcher (#2078) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * check port availability only in main deepspeed launcher * check port availability only in main launcher for deepspeed/torchrun * Update launch.py add comments --------- Co-authored-by: 聂靖入 --- src/accelerate/utils/launch.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/accelerate/utils/launch.py b/src/accelerate/utils/launch.py index 133d55364ad..a299343d90b 100644 --- a/src/accelerate/utils/launch.py +++ b/src/accelerate/utils/launch.py @@ -128,7 +128,10 @@ def prepare_multi_gpu_env(args: argparse.Namespace) -> Dict[str, str]: if main_process_port is None: main_process_port = 29500 - if is_port_in_use(main_process_port): + # only need to check port availability in main process, in case we have to start multiple launchers on the same machine + # for some reasons like splitting log files. + need_port_check = num_machines <= 1 or int(args.machine_rank) == 0 + if need_port_check and is_port_in_use(main_process_port): raise ConnectionError( f"Tried to launch distributed communication on port `{main_process_port}`, but another process is utilizing it. " "Please specify a different port (such as using the `----main_process_port` flag or specifying a different `main_process_port` in your config file)" @@ -272,7 +275,10 @@ def prepare_deepspeed_cmd_env(args: argparse.Namespace) -> Tuple[List[str], Dict if main_process_port is None: main_process_port = 29500 - if is_port_in_use(main_process_port): + # only need to check port availability in main process, in case we have to start multiple launchers on the same machine + # for some reasons like splitting log files. + need_port_check = num_machines <= 1 or int(args.machine_rank) == 0 + if need_port_check and is_port_in_use(main_process_port): raise ConnectionError( f"Tried to launch distributed communication on port `{main_process_port}`, but another process is utilizing it. 
" "Please specify a different port (such as using the `----main_process_port` flag or specifying a different `main_process_port` in your config file)" From a5a7c039a05424a5715f7f829210245fe3165812 Mon Sep 17 00:00:00 2001 From: Frankie Robertson Date: Fri, 17 Nov 2023 16:01:35 +0200 Subject: [PATCH 08/24] Do not attempt to pad nested tensors (#2041) --- src/accelerate/utils/__init__.py | 1 + src/accelerate/utils/operations.py | 11 +++++++++++ tests/test_utils.py | 11 +++++++++++ 3 files changed, 23 insertions(+) diff --git a/src/accelerate/utils/__init__.py b/src/accelerate/utils/__init__.py index fa15b173ed3..497c14abe28 100644 --- a/src/accelerate/utils/__init__.py +++ b/src/accelerate/utils/__init__.py @@ -102,6 +102,7 @@ save_offload_index, ) from .operations import ( + CannotPadNestedTensorWarning, broadcast, broadcast_object_list, concatenate, diff --git a/src/accelerate/utils/operations.py b/src/accelerate/utils/operations.py index 267f7809ef0..5d1df1d995c 100644 --- a/src/accelerate/utils/operations.py +++ b/src/accelerate/utils/operations.py @@ -17,6 +17,7 @@ """ import pickle +import warnings from functools import update_wrapper, wraps from typing import Any, Mapping @@ -525,6 +526,10 @@ def concatenate(data, dim=0): return torch.cat(data, dim=dim) +class CannotPadNestedTensorWarning(UserWarning): + pass + + @chained_operation def pad_across_processes(tensor, dim=0, pad_index=0, pad_first=False): """ @@ -543,6 +548,12 @@ def pad_across_processes(tensor, dim=0, pad_index=0, pad_first=False): """ def _pad_across_processes(tensor, dim=0, pad_index=0, pad_first=False): + if getattr(tensor, "is_nested", False): + warnings.warn( + "Cannot pad nested tensors without more information. Leaving unprocessed.", + CannotPadNestedTensorWarning, + ) + return tensor if dim >= len(tensor.shape): return tensor diff --git a/tests/test_utils.py b/tests/test_utils.py index fa23e72986d..239214bfc3c 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -27,11 +27,13 @@ from accelerate.test_utils.testing import require_cuda, require_torch_min_version from accelerate.test_utils.training import RegressionModel from accelerate.utils import ( + CannotPadNestedTensorWarning, check_os_kernel, convert_outputs_to_fp32, extract_model_from_parallel, find_device, listify, + pad_across_processes, patch_environment, recursively_apply, save, @@ -226,3 +228,12 @@ def forward(self, x): save(model.state_dict(), save_path, safe_serialization=True) self.assertEqual(len(log.records), 1) self.assertIn("Removed shared tensor", log.output[0]) + + @require_torch_min_version(version="1.12") + def test_pad_across_processes(self): + from torch.nested import nested_tensor + + nt = nested_tensor([[1, 2, 3], [1], [1, 2]]) + with self.assertWarns(CannotPadNestedTensorWarning): + nt2 = pad_across_processes(nt) + self.assertIs(nt, nt2) From cd515812483aedd070933c3367366a0b5ef43daa Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Fri, 17 Nov 2023 09:24:20 -0500 Subject: [PATCH 09/24] Add warning for problematic libraries (#2151) * Test bnb and fix nb launcher skip * Fin * Rm comment * PR Review comments * Just star --- src/accelerate/launchers.py | 13 +++++++- .../test_utils/scripts/test_notebook.py | 33 ++++++++++++++----- src/accelerate/test_utils/testing.py | 6 ++-- src/accelerate/utils/__init__.py | 8 ++++- src/accelerate/utils/environment.py | 9 +++++ tests/test_multigpu.py | 23 ++++++------- 6 files changed, 67 insertions(+), 25 deletions(-) diff --git a/src/accelerate/launchers.py b/src/accelerate/launchers.py index 
310e52c313d..0e32d84d06d 100644 --- a/src/accelerate/launchers.py +++ b/src/accelerate/launchers.py @@ -19,7 +19,7 @@ import torch from .state import AcceleratorState, PartialState -from .utils import PrecisionType, PrepareForLaunch, is_mps_available, patch_environment +from .utils import PrecisionType, PrepareForLaunch, are_libraries_initialized, is_mps_available, patch_environment def test_launch(): @@ -142,6 +142,17 @@ def train(*args): "inside your training function. Restart your notebook and make sure no cells initializes an " "`Accelerator`." ) + # Check for specific libraries known to initialize CUDA that users constantly use + problematic_imports = are_libraries_initialized("bitsandbytes") + if len(problematic_imports) > 1: + err = ( + "Could not start distributed process. Libraries known to initialize CUDA upon import have been " + "imported already. Please keep these imports inside your training function to try and help with this:" + ) + for lib_name in problematic_imports: + err += f"\n\t* `{lib_name}`" + raise RuntimeError(err) + # torch.distributed will expect a few environment variable to be here. We set the ones common to each # process here (the other ones will be set be the launcher). with patch_environment( diff --git a/src/accelerate/test_utils/scripts/test_notebook.py b/src/accelerate/test_utils/scripts/test_notebook.py index 8f215d8fd19..999fab34cbe 100644 --- a/src/accelerate/test_utils/scripts/test_notebook.py +++ b/src/accelerate/test_utils/scripts/test_notebook.py @@ -1,17 +1,34 @@ # Test file to ensure that in general certain situational setups for notebooks work. -import argparse +import os + +from pytest import raises from accelerate import PartialState, notebook_launcher +from accelerate.test_utils import require_bnb +from accelerate.utils import is_bnb_available + +def basic_function(): + # Just prints the PartialState + print(f"PartialState:\n{PartialState()}") -parser = argparse.ArgumentParser() -parser.add_argument("--num_processes", type=int, default=1) -args = parser.parse_args() +NUM_PROCESSES = os.environ.get("ACCELERATE_NUM_PROCESSES", 1) -def function(): - print(f"PartialState:\n{PartialState()}") + +def test_can_initialize(): + notebook_launcher(basic_function, (), num_processes=NUM_PROCESSES) + + +@require_bnb +def test_problematic_imports(): + with raises(AssertionError, match="Please keep these imports"): + notebook_launcher(basic_function, (), num_processes=NUM_PROCESSES) -if __name__ == "__main__": - notebook_launcher(function, num_processes=int(args.num_processes)) +def main(): + print("Test basic notebook can be ran") + test_can_initialize() + if is_bnb_available(): + print("Test problematic imports (bnb)") + test_problematic_imports() diff --git a/src/accelerate/test_utils/testing.py b/src/accelerate/test_utils/testing.py index 8a8b82f4e34..75725012181 100644 --- a/src/accelerate/test_utils/testing.py +++ b/src/accelerate/test_utils/testing.py @@ -431,13 +431,15 @@ class SubprocessCallException(Exception): pass -def run_command(command: List[str], return_stdout=False): +def run_command(command: List[str], return_stdout=False, env=None): """ Runs `command` with `subprocess.check_output` and will potentially return the `stdout`. 
Will also properly capture if an error occured while running `command` """ + if env is None: + env = os.environ.copy() try: - output = subprocess.check_output(command, stderr=subprocess.STDOUT) + output = subprocess.check_output(command, stderr=subprocess.STDOUT, env=env) if return_stdout: if hasattr(output, "decode"): output = output.decode("utf-8") diff --git a/src/accelerate/utils/__init__.py b/src/accelerate/utils/__init__.py index 497c14abe28..702d9697acd 100644 --- a/src/accelerate/utils/__init__.py +++ b/src/accelerate/utils/__init__.py @@ -37,7 +37,13 @@ TensorInformation, TorchDynamoPlugin, ) -from .environment import get_int_from_env, parse_choice_from_env, parse_flag_from_env, str_to_bool +from .environment import ( + are_libraries_initialized, + get_int_from_env, + parse_choice_from_env, + parse_flag_from_env, + str_to_bool, +) from .imports import ( get_ccl_version, is_4bit_bnb_available, diff --git a/src/accelerate/utils/environment.py b/src/accelerate/utils/environment.py index 0cd46c2dcaf..cff6e73f380 100644 --- a/src/accelerate/utils/environment.py +++ b/src/accelerate/utils/environment.py @@ -13,6 +13,8 @@ # limitations under the License. import os +import sys +from typing import Dict def str_to_bool(value) -> int: @@ -48,3 +50,10 @@ def parse_flag_from_env(key, default=False): def parse_choice_from_env(key, default="no"): value = os.environ.get(key, str(default)) return value + + +def are_libraries_initialized(*library_names: str) -> Dict[str, bool]: + """ + Checks if any of `library_names` are imported in the environment. Will return results as a `key:bool` pair. + """ + return [lib_name for lib_name in library_names if lib_name in sys.modules] diff --git a/tests/test_multigpu.py b/tests/test_multigpu.py index 73ee6367f5c..a479130b74f 100644 --- a/tests/test_multigpu.py +++ b/tests/test_multigpu.py @@ -21,7 +21,8 @@ import accelerate from accelerate import Accelerator from accelerate.big_modeling import dispatch_model -from accelerate.test_utils import assert_exception, execute_subprocess_async, require_multi_gpu, skip +from accelerate.test_utils import assert_exception, execute_subprocess_async, require_multi_gpu +from accelerate.test_utils.testing import run_command from accelerate.utils import patch_environment @@ -33,6 +34,9 @@ def setUp(self): mod_file.split(os.path.sep)[:-1] + ["scripts", "test_distributed_data_loop.py"] ) self.operation_file_path = os.path.sep.join(mod_file.split(os.path.sep)[:-1] + ["scripts", "test_ops.py"]) + self.notebook_launcher_path = os.path.sep.join( + mod_file.split(os.path.sep)[:-1] + ["scripts", "test_notebook.py"] + ) @require_multi_gpu def test_multi_gpu(self): @@ -66,23 +70,16 @@ def test_distributed_data_loop(self): with patch_environment(omp_num_threads=1, cuda_visible_devices="0,1"): execute_subprocess_async(cmd, env=os.environ.copy()) - # Need to see why this test raises forking issues when ran as a suite - @skip @require_multi_gpu def test_notebook_launcher(self): """ - This test checks that the `notebook_launcher` will be able to intialize - a `PartialState` without issue + This test checks a variety of situations and scenarios + with the `notebook_launcher` """ - cmd = [ - "python", - "-m", - "accelerate.test_utils.scripts.test_notebook", - "--num_processes", - str(torch.cuda.device_count()), - ] + cmd = ["torchrun", f"--nproc_per_node={torch.cuda.device_count()}", self.notebook_launcher_path] + print(f"Running {cmd}") with patch_environment(omp_num_threads=1): - execute_subprocess_async(cmd, env=os.environ.copy()) + 
run_command(cmd, env=os.environ.copy()) if __name__ == "__main__": From 62af7372198c8c3e0fbc43c7490ba1a4c015e2be Mon Sep 17 00:00:00 2001 From: Sumanth R Hegde <39546518+SumanthRH@users.noreply.github.com> Date: Mon, 20 Nov 2023 04:24:30 -0800 Subject: [PATCH 10/24] Add ZeRO++ to DeepSpeed usage docs (#2166) * added zeropp to deepspeed doc file * minor edit to clarify hpz size --- docs/source/usage_guides/deepspeed.md | 30 +++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/docs/source/usage_guides/deepspeed.md b/docs/source/usage_guides/deepspeed.md index 6f6350dcb2a..0404136ff3e 100644 --- a/docs/source/usage_guides/deepspeed.md +++ b/docs/source/usage_guides/deepspeed.md @@ -15,7 +15,7 @@ rendered properly in your Markdown viewer. # DeepSpeed -[DeepSpeed](https://github.com/microsoft/DeepSpeed) implements everything described in the [ZeRO paper](https://arxiv.org/abs/1910.02054). Currently, it provides full support for: +[DeepSpeed](https://github.com/microsoft/DeepSpeed) implements everything described in the [ZeRO paper](https://arxiv.org/abs/1910.02054). Some of the salient optimizations are: 1. Optimizer state partitioning (ZeRO stage 1) 2. Gradient partitioning (ZeRO stage 2) @@ -23,6 +23,7 @@ rendered properly in your Markdown viewer. 4. Custom mixed precision training handling 5. A range of fast CUDA-extension-based optimizers 6. ZeRO-Offload to CPU and Disk/NVMe +7. Heirarchical partitioning of model parameters (ZeRO++) ZeRO-Offload has its own dedicated paper: [ZeRO-Offload: Democratizing Billion-Scale Model Training](https://arxiv.org/abs/2101.06840). And NVMe-support is described in the paper [ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning](https://arxiv.org/abs/2104.07857). @@ -44,7 +45,7 @@ won't be possible on a single GPU. Training: -1. DeepSpeed ZeRO training supports the full ZeRO stages 1, 2 and 3 as well as CPU/Disk offload of optimizer states, gradients and parameters. +1. 🤗 Accelerate integrates all features of DeepSpeed ZeRO. This includes all the ZeRO stages 1, 2 and 3 as well as ZeRO-Offload, ZeRO-Infinity (which can offload to disk/NVMe) and ZeRO++. Below is a short description of Data Parallelism using ZeRO - Zero Redundancy Optimizer along with diagram from this [blog post](https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/) ![ZeRO Data Parallelism](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-zero.png) @@ -60,6 +61,8 @@ Below is a short description of Data Parallelism using ZeRO - Zero Redundancy Op e. **Param Offload**: Offloads the model parameters to CPU/Disk building on top of ZERO Stage 3 + f. **Heirarchical Paritioning**: Enables efficient multi-node training with data-parallel training across nodes and ZeRO-3 sharding within a node, built on top of ZeRO Stage 3. + Note: With respect to Disk Offload, the disk should be an NVME for decent speed but it technically works on any Disk Inference: @@ -349,6 +352,27 @@ accelerate launch examples/by_feature/deepspeed_with_config_support.py \ --report_to "wandb"\ ``` +**ZeRO++ Config Example** +You can use the the features of ZeRO++ by using the appropriate config parameters. Note that ZeRO++ is an extension for ZeRO Stage 3. 
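A config file containing these fields is consumed like any other DeepSpeed config, either by pointing `deepspeed_config_file` at it when answering `accelerate config`, or programmatically through a plugin. A minimal sketch of the programmatic route — the file name is a placeholder, and it assumes the config uses ZeRO Stage 3 as ZeRO++ requires:

```python
from accelerate import Accelerator
from accelerate.utils import DeepSpeedPlugin

# "zeropp_config.json" is a placeholder path to a DeepSpeed config that
# includes the ZeRO++ fields shown below (zero_quantized_weights,
# zero_hpz_partition_size, zero_quantized_gradients).
deepspeed_plugin = DeepSpeedPlugin(hf_ds_config="zeropp_config.json")
accelerator = Accelerator(deepspeed_plugin=deepspeed_plugin)
```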
Here is how the config file can be modified, from [DeepSpeed's ZeRO++ tutorial](https://www.deepspeed.ai/tutorials/zeropp/): + +```json +{ + "zero_optimization": { + "stage": 3, + "reduce_bucket_size": "auto", + + "zero_quantized_weights": true, + "zero_hpz_partition_size": 8, + "zero_quantized_gradients": true, + + "contiguous_gradients": true, + "overlap_comm": true + } +} +``` + +For heirarchical partitioning, the partition size `zero_hpz_partition_size` should ideally be set to the number of GPUs per node. (For example, the above config file assumes 8 GPUs per node) + **Important code changes when using DeepSpeed Config File** 1. DeepSpeed Optimizers and Schedulers. For more information on these, @@ -683,6 +707,8 @@ Papers: - [ZeRO: Memory Optimizations Toward Training Trillion Parameter Models](https://arxiv.org/abs/1910.02054) - [ZeRO-Offload: Democratizing Billion-Scale Model Training](https://arxiv.org/abs/2101.06840) - [ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning](https://arxiv.org/abs/2104.07857) +- [ZeRO++: Extremely Efficient Collective Communication for Giant Model Training](https://arxiv.org/abs/2306.10209) + Finally, please, remember that 🤗 `Accelerate` only integrates DeepSpeed, therefore if you have any problems or questions with regards to DeepSpeed usage, please, file an issue with [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/issues). From fbe00d7897c180a4ac67e5651ba263bb6d9400e8 Mon Sep 17 00:00:00 2001 From: Enming Yuan <2650576090@qq.com> Date: Mon, 20 Nov 2023 20:53:10 +0800 Subject: [PATCH 11/24] Update dataclasses.py (#2168) Bug fix: recompute_activation -> recompute_activations --- src/accelerate/utils/dataclasses.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/accelerate/utils/dataclasses.py b/src/accelerate/utils/dataclasses.py index e0e41568b0c..2d22d460aee 100644 --- a/src/accelerate/utils/dataclasses.py +++ b/src/accelerate/utils/dataclasses.py @@ -1043,7 +1043,7 @@ class MegatronLMPlugin: default=None, metadata={"help": "enable sequence parallelism"}, ) - recompute_activation: bool = field( + recompute_activations: bool = field( default=None, metadata={"help": "enable selective activation recomputation"}, ) @@ -1196,8 +1196,8 @@ def __post_init__(self): self.num_micro_batches = int(os.environ.get(prefix + "NUM_MICRO_BATCHES", 1)) if self.gradient_clipping is None: self.gradient_clipping = float(os.environ.get(prefix + "GRADIENT_CLIPPING", 1.0)) - if self.recompute_activation is None: - self.recompute_activation = str_to_bool(os.environ.get(prefix + "RECOMPUTE_ACTIVATION", "False")) == 1 + if self.recompute_activations is None: + self.recompute_activations = str_to_bool(os.environ.get(prefix + "RECOMPUTE_ACTIVATIONS", "False")) == 1 if self.use_distributed_optimizer is None: self.use_distributed_optimizer = ( str_to_bool(os.environ.get(prefix + "USE_DISTRIBUTED_OPTIMIZER", "False")) == 1 @@ -1234,7 +1234,7 @@ def __post_init__(self): "eval_iters": self.eval_iters, "eval_interval": self.eval_interval, } - if self.recompute_activation: + if self.recompute_activations: self.megatron_lm_default_args["recompute_granularity"] = "selective" if self.tensorboard_dir is not None: self.megatron_lm_default_args["tensorboard_dir"] = self.tensorboard_dir From 35b020635395e9834c645f06b39ae63e3d6799bf Mon Sep 17 00:00:00 2001 From: Marc Sun <57196510+SunMarc@users.noreply.github.com> Date: Mon, 20 Nov 2023 15:49:50 +0100 Subject: [PATCH 12/24] Fix non persistant buffer dispatch (#1941) * offload 
only persistant buffer * add tests and fix naming * remove_non_persistant=True by default * style * style again * fix hooks * fix logic --- src/accelerate/hooks.py | 18 +++++++++++---- src/accelerate/utils/modeling.py | 33 +++++++++++++++++++++++++-- tests/test_big_modeling.py | 39 ++++++++++++++++++++++++++++++++ tests/test_modeling_utils.py | 24 ++++++++++++++++++++ 4 files changed, 108 insertions(+), 6 deletions(-) diff --git a/src/accelerate/hooks.py b/src/accelerate/hooks.py index c033e890f2d..d87f1c18db3 100644 --- a/src/accelerate/hooks.py +++ b/src/accelerate/hooks.py @@ -26,6 +26,7 @@ send_to_device, set_module_tensor_to_device, ) +from .utils.modeling import get_non_persistent_buffers class ModelHook: @@ -262,14 +263,17 @@ def init_hook(self, module): module, include_buffers=self.offload_buffers, recurse=self.place_submodules ) } - for name, _ in named_module_tensors( - module, include_buffers=self.offload_buffers, recurse=self.place_submodules + module, include_buffers=self.offload_buffers, recurse=self.place_submodules, remove_non_persistent=True ): set_module_tensor_to_device(module, name, "meta") if not self.offload_buffers and self.execution_device is not None: for name, _ in module.named_buffers(recurse=self.place_submodules): set_module_tensor_to_device(module, name, self.execution_device) + elif self.offload_buffers and self.execution_device is not None: + for name in get_non_persistent_buffers(module, recurse=self.place_submodules): + set_module_tensor_to_device(module, name, self.execution_device) + return module def pre_forward(self, module, *args, **kwargs): @@ -277,7 +281,10 @@ def pre_forward(self, module, *args, **kwargs): self.input_device = find_device([args, kwargs]) if self.offload: for name, _ in named_module_tensors( - module, include_buffers=self.offload_buffers, recurse=self.place_submodules + module, + include_buffers=self.offload_buffers, + recurse=self.place_submodules, + remove_non_persistent=True, ): fp16_statistics = None if "weight" in name and name.replace("weight", "SCB") in self.weights_map.keys(): @@ -294,7 +301,10 @@ def pre_forward(self, module, *args, **kwargs): def post_forward(self, module, output): if self.offload: for name, _ in named_module_tensors( - module, include_buffers=self.offload_buffers, recurse=self.place_submodules + module, + include_buffers=self.offload_buffers, + recurse=self.place_submodules, + remove_non_persistent=True, ): set_module_tensor_to_device(module, name, "meta") if type(module).__name__ == "Linear8bitLt": diff --git a/src/accelerate/utils/modeling.py b/src/accelerate/utils/modeling.py index fe8358d0756..85481eae283 100644 --- a/src/accelerate/utils/modeling.py +++ b/src/accelerate/utils/modeling.py @@ -365,7 +365,9 @@ def set_module_tensor_to_device( torch.cuda.empty_cache() -def named_module_tensors(module: nn.Module, include_buffers: bool = True, recurse: bool = False): +def named_module_tensors( + module: nn.Module, include_buffers: bool = True, recurse: bool = False, remove_non_persistent: bool = False +): """ A helper function that gathers all the tensors (parameters + buffers) of a given module. If `include_buffers=True` it's the same as doing `module.named_parameters(recurse=recurse) + module.named_buffers(recurse=recurse)`. @@ -377,13 +379,40 @@ def named_module_tensors(module: nn.Module, include_buffers: bool = True, recurs Whether or not to include the buffers in the result. 
recurse (`bool`, *optional`, defaults to `False`): Whether or not to go look in every submodule or just return the direct parameters and buffers. + remove_non_persistent (`bool`, *optional*, defaults to `False`): + Whether or not to remove the non persistent buffer from the buffers. Useful only when include_buffers = + True """ for named_parameter in module.named_parameters(recurse=recurse): yield named_parameter if include_buffers: + non_persistent_buffers = set() + if remove_non_persistent: + non_persistent_buffers = get_non_persistent_buffers(module, recurse=recurse) for named_buffer in module.named_buffers(recurse=recurse): - yield named_buffer + name, _ = named_buffer + if name not in non_persistent_buffers: + yield named_buffer + + +def get_non_persistent_buffers(module: nn.Module, recurse: bool = False): + """ + Gather all non persistent buffers of a given modules into a set + + Args: + module (`nn.Module`): + The module we want the non persistent buffers on. + recurse (`bool`, *optional*, defaults to `False`): + Whether or not to go look in every submodule or just return the direct non persistent buffers. + """ + + non_persistent_buffers_set = module._non_persistent_buffers_set + if recurse: + for _, m in module.named_modules(): + non_persistent_buffers_set |= m._non_persistent_buffers_set + + return non_persistent_buffers_set class FindTiedParametersResult(list): diff --git a/tests/test_big_modeling.py b/tests/test_big_modeling.py index 47aeb7146ff..51ce4a899e4 100644 --- a/tests/test_big_modeling.py +++ b/tests/test_big_modeling.py @@ -45,6 +45,33 @@ def forward(self, x): return self.linear2(self.batchnorm(self.linear1(x))) +class LinearWithNonPersistentBuffers(nn.Module): + def __init__(self, in_features: int, out_features: int, bias: bool = True, device=None, dtype=None) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.in_features = in_features + self.out_features = out_features + self.register_buffer("weight", torch.ones((out_features, in_features), **factory_kwargs)) + if bias: + self.register_buffer("bias", torch.ones(out_features, **factory_kwargs), persistent=False) + else: + self.register_buffer("bias", None) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return torch.nn.functional.linear(input, self.weight, self.bias) + + +class ModelForTestNonPersistentBuffers(nn.Module): + def __init__(self): + super().__init__() + self.linear1 = LinearWithNonPersistentBuffers(3, 4) + self.batchnorm = nn.BatchNorm1d(4) + self.linear2 = LinearWithNonPersistentBuffers(4, 5) + + def forward(self, x): + return self.linear2(self.batchnorm(self.linear1(x))) + + class ModelForTestCopy(nn.Module): def __init__(self, id: int): super().__init__() @@ -302,6 +329,18 @@ def test_dispatch_model(self): output = model(x) self.assertTrue(torch.allclose(expected, output.cpu(), atol=1e-5)) + @require_cuda + def test_dispatch_model_with_non_persistent_buffers(self): + model = ModelForTestNonPersistentBuffers() + device_map = {"linear1": 0, "batchnorm": "cpu", "linear2": "disk"} + x = torch.randn(2, 3) + expected = model(x) + + with TemporaryDirectory() as tmp_dir: + dispatch_model(model, device_map, offload_dir=tmp_dir, offload_buffers=True) + output = model(x) + self.assertTrue(torch.allclose(expected, output.cpu(), atol=1e-5)) + @require_mps def test_dispatch_model_mps(self): model = ModelForTest() diff --git a/tests/test_modeling_utils.py b/tests/test_modeling_utils.py index 7f7bf4c613a..d258938fe44 100644 --- a/tests/test_modeling_utils.py +++ 
b/tests/test_modeling_utils.py @@ -51,6 +51,22 @@ def forward(self, x): return self.linear2(self.batchnorm(self.linear1(x))) +class LinearWithNonPersistentBuffers(nn.Module): + def __init__(self, in_features: int, out_features: int, bias: bool = True, device=None, dtype=None) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.in_features = in_features + self.out_features = out_features + self.register_buffer("weight", torch.empty((out_features, in_features), **factory_kwargs)) + if bias: + self.register_buffer("bias", torch.empty(out_features, **factory_kwargs), persistent=False) + else: + self.register_buffer("bias", None) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return torch.nn.functional.linear(input, self.weight, self.bias) + + def sequential_model(num_layers): layers = OrderedDict([(f"linear{i}", nn.Linear(1000, 1000)) for i in range(1, num_layers + 1)]) return nn.Sequential(layers) @@ -187,6 +203,14 @@ def test_named_tensors(self): ["linear1.weight", "linear1.bias", "batchnorm.weight", "batchnorm.bias", "linear2.weight", "linear2.bias"], ) + model = LinearWithNonPersistentBuffers(10, 10) + + named_tensors = named_module_tensors(model, include_buffers=True, remove_non_persistent=False) + self.assertListEqual([name for name, _ in named_tensors], ["weight", "bias"]) + + named_tensors = named_module_tensors(model, include_buffers=True, remove_non_persistent=True) + self.assertListEqual([name for name, _ in named_tensors], ["weight"]) + def test_find_tied_parameters(self): model = sequential_model(4) self.assertListEqual(find_tied_parameters(model), []) From 427ef8bd009f25c5c1beabd25435266b2704caaf Mon Sep 17 00:00:00 2001 From: Antoni-Joan Solergibert <74564958+TJ-Solergibert@users.noreply.github.com> Date: Mon, 20 Nov 2023 16:42:49 +0100 Subject: [PATCH 13/24] Updated torchrun instructions (#2096) * Updated torchrun instructions * Update examples/README.md Co-authored-by: Benjamin Bossan * Update examples/README.md Co-authored-by: Benjamin Bossan * Update examples/README.md Co-authored-by: Benjamin Bossan * Update examples/README.md Co-authored-by: Benjamin Bossan * Update README.md for torchrun instructions * Added SLURM scripts and updated README * Update examples/Slurm/submit-multinode.sh Co-authored-by: Zach Mueller * Update examples/Slurm/submit-multiGPU.sh Co-authored-by: Zach Mueller * Update examples/README.md Co-authored-by: Zach Mueller * Update examples/README.md Co-authored-by: Zach Mueller * final details * modified argument parser * modified slurm multigpu script * modified multinode slurm script * Added accelerate multine issue * Update examples/README.md Co-authored-by: Zach Mueller * fixed readme commnad * added --main_process_port specification to readme * Revert "modified argument parser" This reverts commit c3bef5cdd11a8a120602b5b7ce158f7400881d7f. 
--------- Co-authored-by: Benjamin Bossan Co-authored-by: Zach Mueller --- examples/README.md | 53 +++++++++++++++--------------- examples/slurm/submit_multigpu.sh | 27 +++++++++++++++ examples/slurm/submit_multinode.sh | 41 +++++++++++++++++++++++ 3 files changed, 95 insertions(+), 26 deletions(-) create mode 100644 examples/slurm/submit_multigpu.sh create mode 100644 examples/slurm/submit_multinode.sh diff --git a/examples/README.md b/examples/README.md index f525607aad3..33e636abfba 100644 --- a/examples/README.md +++ b/examples/README.md @@ -64,9 +64,9 @@ To run it in each of these various modes, use the following commands: accelerate config # This will create a config file on your server accelerate launch ./nlp_example.py # This will run the script on your server ``` - * With traditional PyTorch launcher (`torch.distributed.launch` can be used with older versions of PyTorch) + * With traditional PyTorch launcher (`python -m torch.distributed.run` can be used instead of `torchrun`) ```bash - python -m torchrun --nproc_per_node 2 --use_env ./nlp_example.py + torchrun --nproc_per_node 2 ./nlp_example.py ``` - multi GPUs, multi node (several machines, using PyTorch distributed mode) * With Accelerate config and launcher, on each machine: @@ -74,18 +74,15 @@ To run it in each of these various modes, use the following commands: accelerate config # This will create a config file on each server accelerate launch ./nlp_example.py # This will run the script on each server ``` - * With PyTorch launcher only (`torch.distributed.launch` can be used in older versions of PyTorch) + * With PyTorch launcher only (`python -m torch.distributed.run` can be used instead of `torchrun`). Run this command on each node: ```bash - python -m torchrun --nproc_per_node 2 \ - --use_env \ - --node_rank 0 \ - --master_addr master_node_ip_address \ - ./nlp_example.py # On the first server - python -m torchrun --nproc_per_node 2 \ - --use_env \ - --node_rank 1 \ - --master_addr master_node_ip_address \ - ./nlp_example.py # On the second server + torchrun \ # python -m torch.distributed.run + --nproc_per_node 2 \ + --nnodes 2 \ + --rdzv_id 2299 \ # A unique job id + --rdzv_backend c10d \ + --rdzv_endpoint master_node_ip_address:29500 \ + ./nlp_example.py ``` - (multi) TPUs * With Accelerate config and launcher @@ -152,9 +149,9 @@ To run it in each of these various modes, use the following commands: accelerate config --config_file config.yaml # This will create a config file on your server to `config.yaml` accelerate launch --config_file config.yaml ./cv_example.py --data_dir path_to_data # This will run the script on your server ``` - * With traditional PyTorch launcher (`torch.distributed.launch` can be used with older versions of PyTorch) + * With traditional PyTorch launcher (`python -m torch.distributed.run` can be used instead of `torchrun`) ```bash - python -m torchrun --nproc_per_node 2 --use_env ./cv_example.py --data_dir path_to_data + torchrun --nproc_per_node 2 ./cv_example.py --data_dir path_to_data ``` - multi GPUs, multi node (several machines, using PyTorch distributed mode) * With Accelerate config and launcher, on each machine: @@ -162,18 +159,15 @@ To run it in each of these various modes, use the following commands: accelerate config --config_file config.yaml # This will create a config file on your server to `config.yaml` accelerate launch --config_file config.yaml ./cv_example.py --data_dir path_to_data # This will run the script on each server ``` - * With PyTorch launcher only 
(`torch.distributed.launch` can be used with older versions of PyTorch) + * With PyTorch launcher only (`python -m torch.distributed.run` can be used instead of `torchrun`). Run this command on each node: ```bash - python -m torchrun --nproc_per_node 2 \ - --use_env \ - --node_rank 0 \ - --master_addr master_node_ip_address \ - ./cv_example.py --data_dir path_to_data # On the first server - python -m torchrun --nproc_per_node 2 \ - --use_env \ - --node_rank 1 \ - --master_addr master_node_ip_address \ - ./cv_example.py --data_dir path_to_data # On the second server + torchrun \ # python -m torch.distributed.run + --nproc_per_node 2 \ + --nnodes 2 \ + --rdzv_id 2299 \ # A unique job id + --rdzv_backend c10d \ + --rdzv_endpoint master_node_ip_address:29500 \ + ./cv_example.py --data_dir path_to_data ``` - (multi) TPUs * With Accelerate config and launcher @@ -206,6 +200,13 @@ with `pip install runhouse`, and you can refer to for hardware setup instructions, or this [Colab tutorial](https://colab.research.google.com/drive/1qVwYyLTCPYPSdz9ZX7BZl9Qm0A3j7RJe) for a more in-depth walkthrough. +## SLURM Scripts +In [/slurm/submit_multigpu.sh](./slurm/submit_multigpu.sh) and [/slurm/submit_multinode.sh](./slurm/submit_multinode.sh) we present two scripts for running the examples on a machine with [SLURM](https://slurm.schedmd.com/documentation.html) workload manager. + +In [/slurm/submit_multigpu.sh](./slurm/submit_multigpu.sh) the only parameter in the launcher that needs to be modified is `--num_processes`, which determines the number of GPUs we will use. In this case, using the environment variable `$SLURM_GPUS`, we indicate that we want to utilize all the GPUs available on the node we have requested. + +In [/slurm/submit_multinode.sh](./slurm/submit_multinode.sh) we must specify the number of nodes that will be part of the training (`--num_machines`), how many GPUs we will use in total (`--num_processes`), the [`backend`](https://pytorch.org/docs/stable/elastic/run.html#note-on-rendezvous-backend), `--main_process_ip` which will be the address the master node and the `--main_process_port`. + ## Finer Examples While the first two scripts are extremely barebones when it comes to what you can do with accelerate, more advanced features are documented in two other locations. diff --git a/examples/slurm/submit_multigpu.sh b/examples/slurm/submit_multigpu.sh new file mode 100644 index 00000000000..709d7490064 --- /dev/null +++ b/examples/slurm/submit_multigpu.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +#SBATCH --job-name=multigpu +#SBATCH -D . 
+#SBATCH --output=O-%x.%j +#SBATCH --error=E-%x.%j +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # number of MP tasks +#SBATCH --gres=gpu:4 # number of GPUs per node +#SBATCH --cpus-per-task=160 # number of cores per tasks +#SBATCH --time=01:59:00 # maximum execution time (HH:MM:SS) + +###################### +### Set enviroment ### +###################### +source activateEnviroment.sh +export GPUS_PER_NODE=4 +###################### + +export SCRIPT=/accelerate/examples/complete_nlp_example.py +export SCRIPT_ARGS=" \ + --mixed_precision fp16 \ + --output_dir /accelerate/examples/output \ + --with_tracking \ + " + +accelerate launch --num_processes $GPUS_PER_NODE $SCRIPT $SCRIPT_ARGS \ No newline at end of file diff --git a/examples/slurm/submit_multinode.sh b/examples/slurm/submit_multinode.sh new file mode 100644 index 00000000000..dfd3fa023c5 --- /dev/null +++ b/examples/slurm/submit_multinode.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +#SBATCH --job-name=multinode +#SBATCH -D . +#SBATCH --output=O-%x.%j +#SBATCH --error=E-%x.%j +#SBATCH --nodes=4 # number of nodes +#SBATCH --ntasks-per-node=1 # number of MP tasks +#SBATCH --gres=gpu:4 # number of GPUs per node +#SBATCH --cpus-per-task=160 # number of cores per tasks +#SBATCH --time=01:59:00 # maximum execution time (HH:MM:SS) + +###################### +### Set enviroment ### +###################### +source activateEnviroment.sh +export GPUS_PER_NODE=4 +###################### + +###################### +#### Set network ##### +###################### +head_node_ip=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +###################### + +export LAUNCHER="accelerate launch \ + --num_processes $((SLURM_NNODES * GPUS_PER_NODE)) \ + --num_machines $SLURM_NNODES \ + --rdzv_backend c10d \ + --main_process_ip $head_node_ip \ + --main_process_port 29500 \ + " +export SCRIPT="/accelerate/examples/complete_nlp_example.py" +export SCRIPT_ARGS=" \ + --mixed_precision fp16 \ + --output_dir /accelerate/examples/output \ + " + +# This step is necessary because accelerate launch does not handle multiline arguments properly +export CMD="$LAUNCHER $PYTHON_FILE $ARGS" +srun $CMD \ No newline at end of file From ca300c0a04f843da2c5c8559e7d728926f7e8bf2 Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Mon, 20 Nov 2023 11:41:57 -0500 Subject: [PATCH 14/24] New CI Runners (#2087) * Try merge tests * Fix * Checkout branch * Fix pip install * rebase * Colons * right one * use master * Rm * Add needs * Better clean * always * Forgot other * test on AWS * update all labels * fix multi-gpu working directory * limit to 2 GPU * force run on kube * move build docker image to new ci * test build on CPU instance * move build docker image release to new ci * move scheduled slow tests to new ci * move integration test to new ci * Comments * Right CPU tags * Right machines * PR comments --------- Co-authored-by: Guillaume LEGENDRE --- .../workflows/build-docker-images-release.yml | 4 +- .github/workflows/build_docker_images.yml | 15 +---- .github/workflows/nightly.yml | 23 +++++--- .github/workflows/run_merge_tests.yml | 59 +++++++++++-------- .../self_hosted_integration_tests.yml | 44 +++++++------- 5 files changed, 75 insertions(+), 70 deletions(-) diff --git a/.github/workflows/build-docker-images-release.yml b/.github/workflows/build-docker-images-release.yml index 8d5f2194b78..efb6a95da6c 100644 --- a/.github/workflows/build-docker-images-release.yml +++ b/.github/workflows/build-docker-images-release.yml @@ -21,7 +21,7 @@ jobs: version-cpu: name: "Latest 
Accelerate CPU [version]" - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + runs-on: [self-hosted, intel-cpu, 8-cpu, ci] needs: get-version steps: - name: Set up Docker Buildx @@ -41,7 +41,7 @@ jobs: version-cuda: name: "Latest Accelerate GPU [version]" - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, daily-ci] needs: get-version steps: - name: Set up Docker Buildx diff --git a/.github/workflows/build_docker_images.yml b/.github/workflows/build_docker_images.yml index 59f3e4dda61..75b9fb9eefe 100644 --- a/.github/workflows/build_docker_images.yml +++ b/.github/workflows/build_docker_images.yml @@ -11,19 +11,9 @@ concurrency: cancel-in-progress: false jobs: - clean-storage: - name: "Clean docker image storage" - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] - steps: - - name: Clean storage - run: | - docker image prune --all -f --filter "until=48h" - docker system prune --all -f --filter "until=48h" - latest-cpu: name: "Latest Accelerate CPU [dev]" - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] - needs: clean-storage + runs-on: [self-hosted, intel-cpu, 8-cpu, ci] steps: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v2 @@ -41,8 +31,7 @@ jobs: latest-cuda: name: "Latest Accelerate GPU [dev]" - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] - needs: clean-storage + runs-on: [self-hosted, nvidia-gpu, t4, daily-ci] steps: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v2 diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 09e64e060e3..a06cae176c7 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -13,7 +13,7 @@ env: jobs: run_all_tests_single_gpu: - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, daily-ci] env: CUDA_VISIBLE_DEVICES: "0" TEST_TYPE: "single_gpu" @@ -22,23 +22,25 @@ jobs: options: --gpus all --shm-size "16gb" defaults: run: - working-directory: accelerate/ shell: bash steps: - name: Update clone & pip install run: | source activate accelerate - git config --global --add safe.directory '*' - git fetch && git checkout ${{ github.sha }} + git clone https://github.com/huggingface/accelerate; + cd accelerate; + git checkout ${{ github.sha }}; pip install -e . --no-deps pip install pytest-reportlog tabulate - name: Run test on GPUs + working-directory: accelerate run: | source activate accelerate make test - name: Run examples on GPUs + working-directory: accelerate if: always() run: | source activate accelerate @@ -46,13 +48,14 @@ jobs: make test_examples - name: Generate Report + working-directory: accelerate if: always() run: | pip install slack_sdk tabulate python utils/log_reports.py >> $GITHUB_STEP_SUMMARY run_all_tests_multi_gpu: - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, daily-ci] env: CUDA_VISIBLE_DEVICES: "0,1" TEST_TYPE: "multi_gpu" @@ -61,18 +64,19 @@ jobs: options: --gpus all --shm-size "16gb" defaults: run: - working-directory: accelerate/ shell: bash steps: - name: Update clone run: | source activate accelerate - git config --global --add safe.directory '*' - git fetch && git checkout ${{ github.sha }} + git clone https://github.com/huggingface/accelerate; + cd accelerate; + git checkout ${{ github.sha }}; pip install -e . 
--no-deps pip install pytest-reportlog tabulate - name: Run core and big modeling tests on GPUs + working-directory: accelerate run: | source activate accelerate make test_core @@ -80,12 +84,14 @@ jobs: make test_cli - name: Run Integration tests on GPUs + working-directory: accelerate if: always() run: | source activate accelerate make test_integrations - name: Run examples on GPUs + working-directory: accelerate if: always() run: | source activate accelerate @@ -93,6 +99,7 @@ jobs: make test_examples - name: Generate Report + working-directory: accelerate if: always() run: | pip install slack_sdk tabulate diff --git a/.github/workflows/run_merge_tests.yml b/.github/workflows/run_merge_tests.yml index 7dacab8c508..42bf11e7516 100644 --- a/.github/workflows/run_merge_tests.yml +++ b/.github/workflows/run_merge_tests.yml @@ -10,7 +10,7 @@ env: jobs: run_all_tests_single_gpu: - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, push-ci] env: CUDA_VISIBLE_DEVICES: "0" container: @@ -18,72 +18,81 @@ jobs: options: --gpus all --shm-size "16gb" defaults: run: - working-directory: accelerate/ shell: bash steps: - - name: Update clone & pip install + - name: Install accelerate run: | - source activate accelerate - git config --global --add safe.directory '*' - git fetch && git checkout ${{ github.sha }} - pip install -e .[testing,test_trackers] -U - pip install pytest-reportlog tabulate + source activate accelerate; + git clone https://github.com/huggingface/accelerate; + cd accelerate; + git checkout ${{ github.sha }}; + pip install -e .[testing,test_trackers] -U; + pip install pytest-reportlog tabulate ; - - name: Run CLI tests + - name: Run CLI tests (use make cli) + working-directory: accelerate run: | - source activate accelerate + source activate accelerate; make test_cli - name: Run test on GPUs + working-directory: accelerate if: always() run: | - source activate accelerate + source activate accelerate; make test - name: Run examples on GPUs + working-directory: accelerate if: always() run: | - source activate accelerate - pip uninstall comet_ml -y + source activate accelerate; + pip uninstall comet_ml -y; make test_examples - name: Generate Report + working-directory: accelerate if: always() run: | - pip install tabulate + pip install tabulate; python utils/log_reports.py >> $GITHUB_STEP_SUMMARY run_all_tests_multi_gpu: - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, push-ci] + env: + CUDA_VISIBLE_DEVICES: 0,1 container: image: huggingface/accelerate-gpu:latest options: --gpus all --shm-size "16gb" defaults: run: - working-directory: accelerate/ shell: bash steps: - name: Update clone run: | - source activate accelerate - git config --global --add safe.directory '*' - git fetch && git checkout ${{ github.sha }} - pip install -e .[testing,test_trackers] -U + source activate accelerate; + git clone https://github.com/huggingface/accelerate; + cd accelerate; + git checkout ${{ github.sha }}; + pip install -e .[testing,test_trackers] -U; pip install pytest-reportlog tabulate - name: Run test on GPUs + working-directory: accelerate run: | - source activate accelerate + source activate accelerate; make test - name: Run examples on GPUs + working-directory: accelerate if: always() run: | - source activate accelerate - pip uninstall comet_ml -y + source activate accelerate; + pip uninstall comet_ml -y; make test_examples - name: Generate Report + working-directory: accelerate if: always() 
run: | - pip install tabulate - python utils/log_reports.py >> $GITHUB_STEP_SUMMARY \ No newline at end of file + source activate accelerate; + python utils/log_reports.py >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/self_hosted_integration_tests.yml b/.github/workflows/self_hosted_integration_tests.yml index 94e50e61ff3..cd82295e4e2 100644 --- a/.github/workflows/self_hosted_integration_tests.yml +++ b/.github/workflows/self_hosted_integration_tests.yml @@ -25,7 +25,7 @@ jobs: container: image: huggingface/accelerate-gpu:latest options: --gpus all --shm-size "16gb" - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, push-ci] strategy: fail-fast: false matrix: @@ -34,22 +34,22 @@ jobs: "0,1" ] steps: - - name: Update accelerate clone and pip install - working-directory: accelerate/ - run: + - name: Install transformers + run: | source activate accelerate; - git config --global --add safe.directory '*'; - git checkout main && git fetch && git checkout ${{ github.sha }}; - pip install -e .; + git clone https://github.com/huggingface/transformers --depth 1; + cd transformers; + pip install .[torch,deepspeed-testing]; + cd ..; - - name: Update transformers clone & pip install - working-directory: transformers/ + - name: Install accelerate run: | - source activate accelerate - git config --global --add safe.directory '*' - git checkout main && git pull - pip install .[torch,deepspeed-testing] - pip uninstall comet_ml wandb -y + source activate accelerate; + git clone https://github.com/huggingface/accelerate; + cd accelerate; + git checkout ${{ github.sha }} ; + pip install -e .[testing]; + cd ..; - name: Show installed libraries run: | @@ -89,20 +89,20 @@ jobs: container: image: huggingface/accelerate-gpu:latest options: --gpus all --shm-size "16gb" - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, push-ci] strategy: fail-fast: false steps: - - name: Update accelerate clone and pip install - working-directory: accelerate/ + - name: Install accelerate run: source activate accelerate; - git config --global --add safe.directory '*'; - git checkout main && git fetch && git checkout ${{ github.sha }}; - pip install -e .; + git clone https://github.com/huggingface/accelerate; + cd accelerate; + git checkout ${{ github.sha }}; + pip install -e .[testing]; + cd .. - - name: Update skorch clone & pip install - working-directory: skorch/ + - name: Install skorch run: | source activate accelerate git config --global --add safe.directory '*' From 2b25b8b3c59b82e36d798537656f8d348bc86b6a Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Mon, 20 Nov 2023 12:06:33 -0500 Subject: [PATCH 15/24] Revert "New CI Runners (#2087)" (#2172) This reverts commit ca300c0a04f843da2c5c8559e7d728926f7e8bf2. 
--- .../workflows/build-docker-images-release.yml | 4 +- .github/workflows/build_docker_images.yml | 15 ++++- .github/workflows/nightly.yml | 23 +++----- .github/workflows/run_merge_tests.yml | 59 ++++++++----------- .../self_hosted_integration_tests.yml | 44 +++++++------- 5 files changed, 70 insertions(+), 75 deletions(-) diff --git a/.github/workflows/build-docker-images-release.yml b/.github/workflows/build-docker-images-release.yml index efb6a95da6c..8d5f2194b78 100644 --- a/.github/workflows/build-docker-images-release.yml +++ b/.github/workflows/build-docker-images-release.yml @@ -21,7 +21,7 @@ jobs: version-cpu: name: "Latest Accelerate CPU [version]" - runs-on: [self-hosted, intel-cpu, 8-cpu, ci] + runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] needs: get-version steps: - name: Set up Docker Buildx @@ -41,7 +41,7 @@ jobs: version-cuda: name: "Latest Accelerate GPU [version]" - runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, daily-ci] + runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] needs: get-version steps: - name: Set up Docker Buildx diff --git a/.github/workflows/build_docker_images.yml b/.github/workflows/build_docker_images.yml index 75b9fb9eefe..59f3e4dda61 100644 --- a/.github/workflows/build_docker_images.yml +++ b/.github/workflows/build_docker_images.yml @@ -11,9 +11,19 @@ concurrency: cancel-in-progress: false jobs: + clean-storage: + name: "Clean docker image storage" + runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + steps: + - name: Clean storage + run: | + docker image prune --all -f --filter "until=48h" + docker system prune --all -f --filter "until=48h" + latest-cpu: name: "Latest Accelerate CPU [dev]" - runs-on: [self-hosted, intel-cpu, 8-cpu, ci] + runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + needs: clean-storage steps: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v2 @@ -31,7 +41,8 @@ jobs: latest-cuda: name: "Latest Accelerate GPU [dev]" - runs-on: [self-hosted, nvidia-gpu, t4, daily-ci] + runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + needs: clean-storage steps: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v2 diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index a06cae176c7..09e64e060e3 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -13,7 +13,7 @@ env: jobs: run_all_tests_single_gpu: - runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, daily-ci] + runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] env: CUDA_VISIBLE_DEVICES: "0" TEST_TYPE: "single_gpu" @@ -22,25 +22,23 @@ jobs: options: --gpus all --shm-size "16gb" defaults: run: + working-directory: accelerate/ shell: bash steps: - name: Update clone & pip install run: | source activate accelerate - git clone https://github.com/huggingface/accelerate; - cd accelerate; - git checkout ${{ github.sha }}; + git config --global --add safe.directory '*' + git fetch && git checkout ${{ github.sha }} pip install -e . 
--no-deps pip install pytest-reportlog tabulate - name: Run test on GPUs - working-directory: accelerate run: | source activate accelerate make test - name: Run examples on GPUs - working-directory: accelerate if: always() run: | source activate accelerate @@ -48,14 +46,13 @@ jobs: make test_examples - name: Generate Report - working-directory: accelerate if: always() run: | pip install slack_sdk tabulate python utils/log_reports.py >> $GITHUB_STEP_SUMMARY run_all_tests_multi_gpu: - runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, daily-ci] + runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] env: CUDA_VISIBLE_DEVICES: "0,1" TEST_TYPE: "multi_gpu" @@ -64,19 +61,18 @@ jobs: options: --gpus all --shm-size "16gb" defaults: run: + working-directory: accelerate/ shell: bash steps: - name: Update clone run: | source activate accelerate - git clone https://github.com/huggingface/accelerate; - cd accelerate; - git checkout ${{ github.sha }}; + git config --global --add safe.directory '*' + git fetch && git checkout ${{ github.sha }} pip install -e . --no-deps pip install pytest-reportlog tabulate - name: Run core and big modeling tests on GPUs - working-directory: accelerate run: | source activate accelerate make test_core @@ -84,14 +80,12 @@ jobs: make test_cli - name: Run Integration tests on GPUs - working-directory: accelerate if: always() run: | source activate accelerate make test_integrations - name: Run examples on GPUs - working-directory: accelerate if: always() run: | source activate accelerate @@ -99,7 +93,6 @@ jobs: make test_examples - name: Generate Report - working-directory: accelerate if: always() run: | pip install slack_sdk tabulate diff --git a/.github/workflows/run_merge_tests.yml b/.github/workflows/run_merge_tests.yml index 42bf11e7516..7dacab8c508 100644 --- a/.github/workflows/run_merge_tests.yml +++ b/.github/workflows/run_merge_tests.yml @@ -10,7 +10,7 @@ env: jobs: run_all_tests_single_gpu: - runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, push-ci] + runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] env: CUDA_VISIBLE_DEVICES: "0" container: @@ -18,81 +18,72 @@ jobs: options: --gpus all --shm-size "16gb" defaults: run: + working-directory: accelerate/ shell: bash steps: - - name: Install accelerate + - name: Update clone & pip install run: | - source activate accelerate; - git clone https://github.com/huggingface/accelerate; - cd accelerate; - git checkout ${{ github.sha }}; - pip install -e .[testing,test_trackers] -U; - pip install pytest-reportlog tabulate ; + source activate accelerate + git config --global --add safe.directory '*' + git fetch && git checkout ${{ github.sha }} + pip install -e .[testing,test_trackers] -U + pip install pytest-reportlog tabulate - - name: Run CLI tests (use make cli) - working-directory: accelerate + - name: Run CLI tests run: | - source activate accelerate; + source activate accelerate make test_cli - name: Run test on GPUs - working-directory: accelerate if: always() run: | - source activate accelerate; + source activate accelerate make test - name: Run examples on GPUs - working-directory: accelerate if: always() run: | - source activate accelerate; - pip uninstall comet_ml -y; + source activate accelerate + pip uninstall comet_ml -y make test_examples - name: Generate Report - working-directory: accelerate if: always() run: | - pip install tabulate; + pip install tabulate python utils/log_reports.py >> $GITHUB_STEP_SUMMARY run_all_tests_multi_gpu: - runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, push-ci] - env: - 
CUDA_VISIBLE_DEVICES: 0,1 + runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] container: image: huggingface/accelerate-gpu:latest options: --gpus all --shm-size "16gb" defaults: run: + working-directory: accelerate/ shell: bash steps: - name: Update clone run: | - source activate accelerate; - git clone https://github.com/huggingface/accelerate; - cd accelerate; - git checkout ${{ github.sha }}; - pip install -e .[testing,test_trackers] -U; + source activate accelerate + git config --global --add safe.directory '*' + git fetch && git checkout ${{ github.sha }} + pip install -e .[testing,test_trackers] -U pip install pytest-reportlog tabulate - name: Run test on GPUs - working-directory: accelerate run: | - source activate accelerate; + source activate accelerate make test - name: Run examples on GPUs - working-directory: accelerate if: always() run: | - source activate accelerate; - pip uninstall comet_ml -y; + source activate accelerate + pip uninstall comet_ml -y make test_examples - name: Generate Report - working-directory: accelerate if: always() run: | - source activate accelerate; - python utils/log_reports.py >> $GITHUB_STEP_SUMMARY + pip install tabulate + python utils/log_reports.py >> $GITHUB_STEP_SUMMARY \ No newline at end of file diff --git a/.github/workflows/self_hosted_integration_tests.yml b/.github/workflows/self_hosted_integration_tests.yml index cd82295e4e2..94e50e61ff3 100644 --- a/.github/workflows/self_hosted_integration_tests.yml +++ b/.github/workflows/self_hosted_integration_tests.yml @@ -25,7 +25,7 @@ jobs: container: image: huggingface/accelerate-gpu:latest options: --gpus all --shm-size "16gb" - runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, push-ci] + runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] strategy: fail-fast: false matrix: @@ -34,22 +34,22 @@ jobs: "0,1" ] steps: - - name: Install transformers - run: | + - name: Update accelerate clone and pip install + working-directory: accelerate/ + run: source activate accelerate; - git clone https://github.com/huggingface/transformers --depth 1; - cd transformers; - pip install .[torch,deepspeed-testing]; - cd ..; + git config --global --add safe.directory '*'; + git checkout main && git fetch && git checkout ${{ github.sha }}; + pip install -e .; - - name: Install accelerate + - name: Update transformers clone & pip install + working-directory: transformers/ run: | - source activate accelerate; - git clone https://github.com/huggingface/accelerate; - cd accelerate; - git checkout ${{ github.sha }} ; - pip install -e .[testing]; - cd ..; + source activate accelerate + git config --global --add safe.directory '*' + git checkout main && git pull + pip install .[torch,deepspeed-testing] + pip uninstall comet_ml wandb -y - name: Show installed libraries run: | @@ -89,20 +89,20 @@ jobs: container: image: huggingface/accelerate-gpu:latest options: --gpus all --shm-size "16gb" - runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, push-ci] + runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] strategy: fail-fast: false steps: - - name: Install accelerate + - name: Update accelerate clone and pip install + working-directory: accelerate/ run: source activate accelerate; - git clone https://github.com/huggingface/accelerate; - cd accelerate; - git checkout ${{ github.sha }}; - pip install -e .[testing]; - cd .. 
+ git config --global --add safe.directory '*'; + git checkout main && git fetch && git checkout ${{ github.sha }}; + pip install -e .; - - name: Install skorch + - name: Update skorch clone & pip install + working-directory: skorch/ run: | source activate accelerate git config --global --add safe.directory '*' From 1243191ecbbd6a91a221e3cb56276b8e95f6c028 Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Mon, 20 Nov 2023 13:01:12 -0500 Subject: [PATCH 16/24] [Working again] New CI (#2173) * Try merge tests * Fix * Checkout branch * Fix pip install * rebase * Colons * right one * use master * Rm * Add needs * Better clean * always * Forgot other * test on AWS * update all labels * fix multi-gpu working directory * limit to 2 GPU * force run on kube * move build docker image to new ci * test build on CPU instance * move build docker image release to new ci * move scheduled slow tests to new ci * move integration test to new ci * Comments * Right CPU tags * Right machines * PR comments * Fix issues * Some trailers --------- Co-authored-by: Guillaume LEGENDRE --- .../workflows/build-docker-images-release.yml | 4 +- .github/workflows/build_docker_images.yml | 15 +---- .github/workflows/nightly.yml | 23 +++++--- .github/workflows/run_merge_tests.yml | 59 +++++++++++-------- .../self_hosted_integration_tests.yml | 44 +++++++------- 5 files changed, 75 insertions(+), 70 deletions(-) diff --git a/.github/workflows/build-docker-images-release.yml b/.github/workflows/build-docker-images-release.yml index 8d5f2194b78..20a9ea51e5a 100644 --- a/.github/workflows/build-docker-images-release.yml +++ b/.github/workflows/build-docker-images-release.yml @@ -21,7 +21,7 @@ jobs: version-cpu: name: "Latest Accelerate CPU [version]" - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + runs-on: [self-hosted, intel-cpu, 8-cpu, ci] needs: get-version steps: - name: Set up Docker Buildx @@ -41,7 +41,7 @@ jobs: version-cuda: name: "Latest Accelerate GPU [version]" - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, ci] needs: get-version steps: - name: Set up Docker Buildx diff --git a/.github/workflows/build_docker_images.yml b/.github/workflows/build_docker_images.yml index 59f3e4dda61..557032a0de2 100644 --- a/.github/workflows/build_docker_images.yml +++ b/.github/workflows/build_docker_images.yml @@ -11,19 +11,9 @@ concurrency: cancel-in-progress: false jobs: - clean-storage: - name: "Clean docker image storage" - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] - steps: - - name: Clean storage - run: | - docker image prune --all -f --filter "until=48h" - docker system prune --all -f --filter "until=48h" - latest-cpu: name: "Latest Accelerate CPU [dev]" - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] - needs: clean-storage + runs-on: [self-hosted, intel-cpu, 8-cpu, ci] steps: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v2 @@ -41,8 +31,7 @@ jobs: latest-cuda: name: "Latest Accelerate GPU [dev]" - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] - needs: clean-storage + runs-on: [self-hosted, nvidia-gpu, t4, ci] steps: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v2 diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 09e64e060e3..3e7f77bf9fd 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -13,7 +13,7 @@ env: jobs: run_all_tests_single_gpu: - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, ci] env: 
CUDA_VISIBLE_DEVICES: "0" TEST_TYPE: "single_gpu" @@ -22,23 +22,25 @@ jobs: options: --gpus all --shm-size "16gb" defaults: run: - working-directory: accelerate/ shell: bash steps: - name: Update clone & pip install run: | source activate accelerate - git config --global --add safe.directory '*' - git fetch && git checkout ${{ github.sha }} + git clone https://github.com/huggingface/accelerate; + cd accelerate; + git checkout ${{ github.sha }}; pip install -e . --no-deps pip install pytest-reportlog tabulate - name: Run test on GPUs + working-directory: accelerate run: | source activate accelerate make test - name: Run examples on GPUs + working-directory: accelerate if: always() run: | source activate accelerate @@ -46,13 +48,14 @@ jobs: make test_examples - name: Generate Report + working-directory: accelerate if: always() run: | pip install slack_sdk tabulate python utils/log_reports.py >> $GITHUB_STEP_SUMMARY run_all_tests_multi_gpu: - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, ci] env: CUDA_VISIBLE_DEVICES: "0,1" TEST_TYPE: "multi_gpu" @@ -61,18 +64,19 @@ jobs: options: --gpus all --shm-size "16gb" defaults: run: - working-directory: accelerate/ shell: bash steps: - name: Update clone run: | source activate accelerate - git config --global --add safe.directory '*' - git fetch && git checkout ${{ github.sha }} + git clone https://github.com/huggingface/accelerate; + cd accelerate; + git checkout ${{ github.sha }}; pip install -e . --no-deps pip install pytest-reportlog tabulate - name: Run core and big modeling tests on GPUs + working-directory: accelerate run: | source activate accelerate make test_core @@ -80,12 +84,14 @@ jobs: make test_cli - name: Run Integration tests on GPUs + working-directory: accelerate if: always() run: | source activate accelerate make test_integrations - name: Run examples on GPUs + working-directory: accelerate if: always() run: | source activate accelerate @@ -93,6 +99,7 @@ jobs: make test_examples - name: Generate Report + working-directory: accelerate if: always() run: | pip install slack_sdk tabulate diff --git a/.github/workflows/run_merge_tests.yml b/.github/workflows/run_merge_tests.yml index 7dacab8c508..ef6ea84bc7e 100644 --- a/.github/workflows/run_merge_tests.yml +++ b/.github/workflows/run_merge_tests.yml @@ -10,7 +10,7 @@ env: jobs: run_all_tests_single_gpu: - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, ci] env: CUDA_VISIBLE_DEVICES: "0" container: @@ -18,72 +18,81 @@ jobs: options: --gpus all --shm-size "16gb" defaults: run: - working-directory: accelerate/ shell: bash steps: - - name: Update clone & pip install + - name: Install accelerate run: | - source activate accelerate - git config --global --add safe.directory '*' - git fetch && git checkout ${{ github.sha }} - pip install -e .[testing,test_trackers] -U - pip install pytest-reportlog tabulate + source activate accelerate; + git clone https://github.com/huggingface/accelerate; + cd accelerate; + git checkout ${{ github.sha }}; + pip install -e .[testing,test_trackers] -U; + pip install pytest-reportlog tabulate ; - - name: Run CLI tests + - name: Run CLI tests (use make cli) + working-directory: accelerate run: | - source activate accelerate + source activate accelerate; make test_cli - name: Run test on GPUs + working-directory: accelerate if: always() run: | - source activate accelerate + source activate accelerate; make test - name: Run examples on GPUs + 
working-directory: accelerate if: always() run: | - source activate accelerate - pip uninstall comet_ml -y + source activate accelerate; + pip uninstall comet_ml -y; make test_examples - name: Generate Report + working-directory: accelerate if: always() run: | - pip install tabulate + pip install tabulate; python utils/log_reports.py >> $GITHUB_STEP_SUMMARY run_all_tests_multi_gpu: - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, ci] + env: + CUDA_VISIBLE_DEVICES: 0,1 container: image: huggingface/accelerate-gpu:latest options: --gpus all --shm-size "16gb" defaults: run: - working-directory: accelerate/ shell: bash steps: - name: Update clone run: | - source activate accelerate - git config --global --add safe.directory '*' - git fetch && git checkout ${{ github.sha }} - pip install -e .[testing,test_trackers] -U + source activate accelerate; + git clone https://github.com/huggingface/accelerate; + cd accelerate; + git checkout ${{ github.sha }}; + pip install -e .[testing,test_trackers] -U; pip install pytest-reportlog tabulate - name: Run test on GPUs + working-directory: accelerate run: | - source activate accelerate + source activate accelerate; make test - name: Run examples on GPUs + working-directory: accelerate if: always() run: | - source activate accelerate - pip uninstall comet_ml -y + source activate accelerate; + pip uninstall comet_ml -y; make test_examples - name: Generate Report + working-directory: accelerate if: always() run: | - pip install tabulate - python utils/log_reports.py >> $GITHUB_STEP_SUMMARY \ No newline at end of file + source activate accelerate; + python utils/log_reports.py >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/self_hosted_integration_tests.yml b/.github/workflows/self_hosted_integration_tests.yml index 94e50e61ff3..3c12b51e259 100644 --- a/.github/workflows/self_hosted_integration_tests.yml +++ b/.github/workflows/self_hosted_integration_tests.yml @@ -25,7 +25,7 @@ jobs: container: image: huggingface/accelerate-gpu:latest options: --gpus all --shm-size "16gb" - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, ci] strategy: fail-fast: false matrix: @@ -34,22 +34,22 @@ jobs: "0,1" ] steps: - - name: Update accelerate clone and pip install - working-directory: accelerate/ - run: + - name: Install transformers + run: | source activate accelerate; - git config --global --add safe.directory '*'; - git checkout main && git fetch && git checkout ${{ github.sha }}; - pip install -e .; + git clone https://github.com/huggingface/transformers --depth 1; + cd transformers; + pip install .[torch,deepspeed-testing]; + cd ..; - - name: Update transformers clone & pip install - working-directory: transformers/ + - name: Install accelerate run: | - source activate accelerate - git config --global --add safe.directory '*' - git checkout main && git pull - pip install .[torch,deepspeed-testing] - pip uninstall comet_ml wandb -y + source activate accelerate; + git clone https://github.com/huggingface/accelerate; + cd accelerate; + git checkout ${{ github.sha }} ; + pip install -e .[testing]; + cd ..; - name: Show installed libraries run: | @@ -89,20 +89,20 @@ jobs: container: image: huggingface/accelerate-gpu:latest options: --gpus all --shm-size "16gb" - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, ci] strategy: fail-fast: false steps: - - name: Update accelerate clone and pip install - 
working-directory: accelerate/ + - name: Install accelerate run: source activate accelerate; - git config --global --add safe.directory '*'; - git checkout main && git fetch && git checkout ${{ github.sha }}; - pip install -e .; + git clone https://github.com/huggingface/accelerate; + cd accelerate; + git checkout ${{ github.sha }}; + pip install -e .[testing]; + cd .. - - name: Update skorch clone & pip install - working-directory: skorch/ + - name: Install skorch run: | source activate accelerate git config --global --add safe.directory '*' From b8ca803f98430048ff479bdc351381871c0074a5 Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Mon, 20 Nov 2023 13:11:08 -0500 Subject: [PATCH 17/24] Don't make it wait --- .github/workflows/build_and_run_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_run_tests.yml b/.github/workflows/build_and_run_tests.yml index 1f29e189505..fffda0dc921 100644 --- a/.github/workflows/build_and_run_tests.yml +++ b/.github/workflows/build_and_run_tests.yml @@ -45,6 +45,6 @@ jobs: uses: ./.github/workflows/run_merge_tests.yml run-integration-tests: - needs: run-merge-tests + needs: build-docker-containers if: always() uses: ./.github/workflows/self_hosted_integration_tests.yml \ No newline at end of file From 7d430cf8dec31848dc82e3942f4881801c639032 Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Mon, 20 Nov 2023 13:30:23 -0500 Subject: [PATCH 18/24] skorch --- .github/workflows/self_hosted_integration_tests.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/self_hosted_integration_tests.yml b/.github/workflows/self_hosted_integration_tests.yml index 3c12b51e259..ca46fe2e520 100644 --- a/.github/workflows/self_hosted_integration_tests.yml +++ b/.github/workflows/self_hosted_integration_tests.yml @@ -105,6 +105,8 @@ jobs: - name: Install skorch run: | source activate accelerate + git clone https://github.com/huggingface/skorch; + cd skorch; git config --global --add safe.directory '*' git checkout master && git pull pip install .[testing] From 0e51680994f5b9085de615604661d9022439bd0b Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Mon, 20 Nov 2023 14:03:49 -0500 Subject: [PATCH 19/24] Right URL --- .github/workflows/self_hosted_integration_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/self_hosted_integration_tests.yml b/.github/workflows/self_hosted_integration_tests.yml index ca46fe2e520..a28469f5193 100644 --- a/.github/workflows/self_hosted_integration_tests.yml +++ b/.github/workflows/self_hosted_integration_tests.yml @@ -105,7 +105,7 @@ jobs: - name: Install skorch run: | source activate accelerate - git clone https://github.com/huggingface/skorch; + git clone https://github.com/skorch-dev/skorch; cd skorch; git config --global --add safe.directory '*' git checkout master && git pull From 1aeb1e8997bd393c0cc9943752b061e9a4bc67ef Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Tue, 21 Nov 2023 08:41:57 -0500 Subject: [PATCH 20/24] Don't make integration tests wait --- .github/workflows/nightly.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 3e7f77bf9fd..72450599515 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -107,6 +107,5 @@ jobs: run-integration-tests: - needs: [run_all_tests_single_gpu, run_all_tests_multi_gpu] if: always() uses: ./.github/workflows/self_hosted_integration_tests.yml \ No newline at end of file From 
d25efa71ce76a5f5911a1fc6c039979d7248596f Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Tue, 21 Nov 2023 09:54:33 -0500 Subject: [PATCH 21/24] Don't install comet --- .github/workflows/self_hosted_integration_tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/self_hosted_integration_tests.yml b/.github/workflows/self_hosted_integration_tests.yml index a28469f5193..42e26bfc1bd 100644 --- a/.github/workflows/self_hosted_integration_tests.yml +++ b/.github/workflows/self_hosted_integration_tests.yml @@ -49,6 +49,7 @@ jobs: cd accelerate; git checkout ${{ github.sha }} ; pip install -e .[testing]; + pip uninstall comet_ml wandb -y cd ..; - name: Show installed libraries From 244122c736141b164242084c659b6dafa4208fea Mon Sep 17 00:00:00 2001 From: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> Date: Fri, 24 Nov 2023 09:31:57 +0530 Subject: [PATCH 22/24] fsdp refactoring (#2177) * remove the redundant code post the torch 2.1 release * make `use_orig_params=True` by default. * fix `save_state` optimizer saving for fsdp and update the fsdp example * quality * fixing the utils and tests. Updating the docs * bump up the minimum version for FSDP support. * address comment * rename fsdp model checkpointing variables --- docs/source/usage_guides/fsdp.md | 82 ++++------------- .../by_feature/fsdp_with_peak_mem_tracking.py | 32 +++---- src/accelerate/accelerator.py | 92 ++----------------- src/accelerate/commands/config/cluster.py | 4 +- src/accelerate/commands/launch.py | 2 +- src/accelerate/utils/constants.py | 3 +- src/accelerate/utils/dataclasses.py | 2 +- src/accelerate/utils/fsdp_utils.py | 38 ++++---- tests/fsdp/test_fsdp.py | 5 + 9 files changed, 70 insertions(+), 190 deletions(-) diff --git a/docs/source/usage_guides/fsdp.md b/docs/source/usage_guides/fsdp.md index a57a4bf6801..96385a38178 100644 --- a/docs/source/usage_guides/fsdp.md +++ b/docs/source/usage_guides/fsdp.md @@ -40,23 +40,30 @@ For instance, here is how you would run the NLP example (from the root of the re ```bash compute_environment: LOCAL_MACHINE -deepspeed_config: {} +debug: false distributed_type: FSDP downcast_bf16: 'no' fsdp_config: fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP fsdp_backward_prefetch_policy: BACKWARD_PRE + fsdp_cpu_ram_efficient_loading: true + fsdp_forward_prefetch: false fsdp_offload_params: false fsdp_sharding_strategy: 1 - fsdp_state_dict_type: FULL_STATE_DICT + fsdp_state_dict_type: SHARDED_STATE_DICT + fsdp_sync_module_states: true fsdp_transformer_layer_cls_to_wrap: BertLayer + fsdp_use_orig_params: true machine_rank: 0 -main_process_ip: null -main_process_port: null main_training_function: main -mixed_precision: 'no' +mixed_precision: bf16 num_machines: 1 num_processes: 2 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false use_cpu: false ``` @@ -66,7 +73,7 @@ accelerate launch examples/nlp_example.py Currently, `Accelerate` supports the following config through the CLI: -```bash + `Sharding Strategy`: [1] FULL_SHARD (shards optimizer states, gradients and parameters), [2] SHARD_GRAD_OP (shards optimizer states and gradients), [3] NO_SHARD (DDP), [4] HYBRID_SHARD (shards optimizer states, gradients and parameters within each node while each node has full copy), [5] HYBRID_SHARD_ZERO2 (shards optimizer states and gradients within each node while each node has full copy) `Offload Params`: Decides Whether to offload parameters and gradients to CPU @@ -94,12 +101,12 @@ all-gather while executing in the forward 
pass. only use with Static graphs. `Use Orig Params`: If True, allows non-uniform `requires_grad` during init, which means support for interspersed frozen and trainable paramteres. Useful in cases such as parameter-efficient fine-tuning. -Please refer this [blog](https://dev-discuss.pytorch.org/t/rethinking-pytorch-fully-sharded-data-parallel-fsdp-from-first-principles/1019) +Please refer this [blog](https://dev-discuss.pytorch.org/t/rethinking-pytorch-fully-sharded-data-parallel-fsdp-from-first-principles/1019). This also enables to have different optimizer param groups. This should be `True` when creating optimizer object before preparing/wrapping the model with FSDP. `CPU RAM Efficient Model loading`: If True, only the first process loads the pretrained model checkoint while all other processes have empty weights. Only applicable for 🤗 Transformers models. This should be set to False if you experience errors when loading the pretrained 🤗 Transformers model via `from_pretrained` method. When using this, `Sync Module States` needs to be True else all the processes expect the main process would have random empty weights leading to unexpected behaviour during training. `Sync Module States`: If True, each individually wrapped FSDP unit will broadcast module parameters from rank 0 -``` + For additional and more nuanced control, you can specify other FSDP parameters via `FullyShardedDataParallelPlugin`. When creating `FullyShardedDataParallelPlugin` object, pass it the parameters that weren't part of the accelerate config or if you want to override them. @@ -156,72 +163,19 @@ When using transformers `save_pretrained`, pass `state_dict=accelerator.get_stat args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save, -+ state_dict=accelerator.get_state_dict(model, unwrap=False), ++ state_dict=accelerator.get_state_dict(model), ) ``` ### State Dict -`accelerator.get_state_dict` will call the underlying `model.state_dict` implementation. With a model wrapped by FSDP, the default behavior of `state_dict` is to gather all of the state in the rank 0 device. This can cause CUDA out of memory errors if the parameters don't fit on a single GPU. - -To avoid this, PyTorch provides a context manager that adjusts the behavior of `state_dict`. To offload some of the state dict onto CPU, you can use the following code: - -``` -from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, StateDictType, FullStateDictConfig - -full_state_dict_config = FullStateDictConfig(offload_to_cpu=True, rank0_only=True) -with FSDP.state_dict_type(unwrapped_model, StateDictType.FULL_STATE_DICT, full_state_dict_config): - state = accelerator.get_state_dict(unwrapped_model) -``` +`accelerator.get_state_dict` will call the underlying `model.state_dict` implementation using `FullStateDictConfig(offload_to_cpu=True, rank0_only=True)` context manager to get the state dict only for rank 0 and it will be offloaded to CPU. You can then pass `state` into the `save_pretrained` method. There are several modes for `StateDictType` and `FullStateDictConfig` that you can use to control the behavior of `state_dict`. For more information, see the [PyTorch documentation](https://pytorch.org/docs/stable/fsdp.html). ## A few caveats to be aware of -- PyTorch FSDP auto wraps sub-modules, flattens the parameters and shards the parameters in place. - Due to this, any optimizer created before model wrapping gets broken and occupies more memory. 
- Hence, it is highly recommended and efficient to prepare the model before creating the optimizer. - `Accelerate` will automatically wrap the model and create an optimizer for you in case of single model with a warning message. - > FSDP Warning: When using FSDP, it is efficient and recommended to call prepare for the model before creating the optimizer - -However, below is the recommended way to prepare model and optimizer while using FSDP: - -```diff - model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True) -+ model = accelerator.prepare(model) - - optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr) - -- model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( -- model, optimizer, train_dataloader, eval_dataloader, lr_scheduler -- ) - -+ optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( -+ optimizer, train_dataloader, eval_dataloader, lr_scheduler -+ ) -``` - -- In case of a single model, if you have created the optimizer with multiple parameter groups and called prepare with them together, - then the parameter groups will be lost and the following warning is displayed: - > FSDP Warning: When using FSDP, several parameter groups will be conflated into - > a single one due to nested module wrapping and parameter flattening. - - This is because parameter groups created before wrapping will have no meaning post wrapping due to parameter flattening of nested FSDP modules into 1D arrays (which can consume many layers). - For instance, below are the named parameters of an FSDP model on GPU 0 (When using 2 GPUs. Around 55M (110M/2) params in 1D arrays as this will have the 1st shard of the parameters). - Here, if one has applied no weight decay for [bias, LayerNorm.weight] the named parameters of an unwrapped BERT model, - it can't be applied to the below FSDP wrapped model as there are no named parameters with either of those strings and - the parameters of those layers are concatenated with parameters of various other layers. - ``` - { - '_fsdp_wrapped_module.flat_param': torch.Size([494209]), - '_fsdp_wrapped_module._fpw_module.bert.embeddings.word_embeddings._fsdp_wrapped_module.flat_param': torch.Size([11720448]), - '_fsdp_wrapped_module._fpw_module.bert.encoder._fsdp_wrapped_module.flat_param': torch.Size([42527232]) - } - ``` - - -- In case of multiple models, it is necessary to prepare the models before creating optimizers or else it will throw an error. -Then pass the optimizers to the prepare call in the same order as corresponding models else `accelerator.save_state()` and `accelerator.load_state()` will result in wrong/unexpected behaviour. +- In case of multiple models, pass the optimizers to the prepare call in the same order as corresponding models else `accelerator.save_state()` and `accelerator.load_state()` will result in wrong/unexpected behaviour. - This feature is incompatible with `--predict_with_generate` in the `run_translation.py` script of 🤗 `Transformers` library. For more control, users can leverage the `FullyShardedDataParallelPlugin`. After creating an instance of this class, users can pass it to the Accelerator class instantiation. 
diff --git a/examples/by_feature/fsdp_with_peak_mem_tracking.py b/examples/by_feature/fsdp_with_peak_mem_tracking.py index 8abe3278953..22c87ada540 100644 --- a/examples/by_feature/fsdp_with_peak_mem_tracking.py +++ b/examples/by_feature/fsdp_with_peak_mem_tracking.py @@ -247,16 +247,19 @@ def collate_fn(examples): args.model_name_or_path, return_dict=True, low_cpu_mem_usage=True ) - # New Code # - # For FSDP feature, it is highly recommended and efficient to prepare the model before creating optimizer - model = accelerator.prepare(model) - accelerator.print(model) - - # Instantiate optimizer - # New Code # - # For FSDP feature, at present it doesn't support multiple parameter groups, - # so we need to create a single parameter group for the whole model - optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr, weight_decay=2e-4) + no_decay = ["bias", "LayerNorm.weight"] + optimizer_grouped_parameters = [ + { + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": 0.003, + }, + { + "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], + "weight_decay": 0.0, + }, + ] + + optimizer = torch.optim.AdamW(params=optimizer_grouped_parameters, lr=lr, weight_decay=2e-4) # Instantiate scheduler lr_scheduler = get_linear_schedule_with_warmup( @@ -265,13 +268,8 @@ def collate_fn(examples): num_training_steps=(len(train_dataloader) * num_epochs) // gradient_accumulation_steps, ) - # New Code # - # For FSDP feature, prepare everything except the model as we have already prepared the model - # before creating the optimizer - # There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the - # prepare method. - optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( - optimizer, train_dataloader, eval_dataloader, lr_scheduler + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler ) overall_step = 0 diff --git a/src/accelerate/accelerator.py b/src/accelerate/accelerator.py index c464cbd6f28..55aef1a366b 100755 --- a/src/accelerate/accelerator.py +++ b/src/accelerate/accelerator.py @@ -1100,52 +1100,6 @@ def _prepare_one(self, obj, first_pass=False, device_placement=None): # Return the unprocessed object if previous criteria was not met return obj - def _prepare_fsdp(self, *args): - result = [] - for obj in args: - if isinstance(obj, torch.nn.Module): - model = obj - break - optimizers = [] - - self._schedulers = [] - self._models = [] - intermediate_result = [] - for obj in args: - if isinstance(obj, torch.optim.Optimizer): - if len(obj.param_groups) > 1: - logger.warning( - "FSDP Warning: When using FSDP, several parameter groups will be conflated into " - "a single one due to nested module wrapping and parameter flattening." 
- ) - try: - optimizer = obj.optimizer.__class__(model.parameters(), **obj.optimizer.defaults) - except TypeError: - if "differentiable" in obj.optimizer.defaults: - # https://github.com/huggingface/accelerate/issues/801 - defaults = {k: v for k, v in obj.optimizer.defaults.items() if k != "differentiable"} - optimizer = obj.optimizer.__class__(model.parameters(), **defaults) - else: - raise - obj = self.prepare_optimizer(optimizer) - optimizers.append(obj) - elif isinstance(obj, torch.nn.Module): - self._models.append(obj) - intermediate_result.append(obj) - - for obj in intermediate_result: - if isinstance(obj, AcceleratedScheduler): - obj.optimizer = optimizers - for i, opt in enumerate(self._optimizers): - if getattr(obj.scheduler, "optimizer", None) == opt.optimizer: - obj.scheduler.optimizer = optimizers[i] - obj.optimizers = [optimizers[i]] - break - self._schedulers.append(obj) - result.append(obj) - self._optimizers = optimizers - return tuple(result) - def prepare(self, *args, device_placement=None): """ Prepare all objects passed in `args` for distributed training and mixed precision, then return them in the same @@ -1214,35 +1168,6 @@ def prepare(self, *args, device_placement=None): " Please rerun your script specifying `--num_processes=1` or by launching with `python {{myscript.py}}`." ) - if self.distributed_type == DistributedType.FSDP: - from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP - - model_count = 0 - optimizer_present = False - is_type_fsdp = False - for obj in args: - if isinstance(obj, torch.nn.Module): - model_count += 1 - # if the model is compiled using PyTorch 2.0, - # check that the wrapped model is FSDP or not; - # else check if it is FSDP or not; - is_type_fsdp = isinstance(obj, FSDP) or ( - is_compiled_module(obj) and isinstance(obj._orig_mod, FSDP) - ) - if isinstance(obj, torch.optim.Optimizer): - optimizer_present = True - if model_count > 1 and optimizer_present: - raise ValueError( - "For FSDP to work with multiple models (>1), " - "prepare must be called for all the models before optimizers are created. " - "Then pass the optimizers to the prepare call in the same order as corresponding models." 
- ) - elif model_count == 1 and not is_type_fsdp and optimizer_present: - logger.warning( - "FSDP Warning: When using FSDP, " - "it is efficient and recommended to call prepare for the model before creating the optimizer" - ) - if self.distributed_type == DistributedType.DEEPSPEED: model_count = 0 for obj in args: @@ -1298,14 +1223,6 @@ def prepare(self, *args, device_placement=None): if isinstance(obj, torch.optim.Optimizer): obj._switch_parameters(mapping) - if ( - self.distributed_type == DistributedType.FSDP - and model_count == 1 - and not is_type_fsdp - and optimizer_present - ): - result = self._prepare_fsdp(*result) - for item in result: if any( item in container @@ -2753,7 +2670,7 @@ def _inner(folder): # Save the optimizers taking care of FSDP and DeepSpeed nuances optimizers = [] if self.distributed_type == DistributedType.FSDP: - for opt in self._optimizers: + for i, opt in enumerate(self._optimizers): logger.info("Saving FSDP Optimizer") save_fsdp_optimizer(self.state.fsdp_plugin, self, opt, self._models[i], output_dir, i) logger.info(f"FSDP Optimizer saved to output dir {output_dir}") @@ -3068,6 +2985,13 @@ def get_state_dict(self, model, unwrap=True): from deepspeed.checkpoint.utils import clone_tensors_for_torch_save state_dict = clone_tensors_for_torch_save(self.unwrap_model(model).state_dict()) + elif self.distributed_type == DistributedType.FSDP: + from torch.distributed.fsdp import FullStateDictConfig, StateDictType + from torch.distributed.fsdp import FullyShardedDataParallel as FSDP + + full_state_dict_config = FullStateDictConfig(offload_to_cpu=True, rank0_only=True) + with FSDP.state_dict_type(model, StateDictType.FULL_STATE_DICT, full_state_dict_config): + state_dict = model.state_dict() else: if unwrap: model = self.unwrap_model(model) diff --git a/src/accelerate/commands/config/cluster.py b/src/accelerate/commands/config/cluster.py index 1331e7fe43c..85d13d19cc5 100644 --- a/src/accelerate/commands/config/cluster.py +++ b/src/accelerate/commands/config/cluster.py @@ -381,9 +381,9 @@ def get_cluster_input(): error_message="Please enter yes or no.", ) fsdp_config["fsdp_use_orig_params"] = _ask_field( - "Do you want to enable FSDP's `use_orig_params` feature? [yes/NO]: ", + "Do you want to enable FSDP's `use_orig_params` feature? [YES/no]: ", _convert_yes_no_to_bool, - default=False, + default=True, error_message="Please enter yes or no.", ) fsdp_config["fsdp_cpu_ram_efficient_loading"] = _ask_field( diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py index 2dfc4fdb7ee..8e44919b23d 100644 --- a/src/accelerate/commands/launch.py +++ b/src/accelerate/commands/launch.py @@ -519,7 +519,7 @@ def launch_command_parser(subparsers=None): ) fsdp_args.add_argument( "--fsdp_use_orig_params", - default="false", + default="true", type=str, help="If True, allows non-uniform `requires_grad` during init, which means support for interspersed frozen and trainable paramteres." 
" (useful only when `use_fsdp` flag is passed).", diff --git a/src/accelerate/utils/constants.py b/src/accelerate/utils/constants.py index 843eb5756af..c17487ade01 100644 --- a/src/accelerate/utils/constants.py +++ b/src/accelerate/utils/constants.py @@ -34,7 +34,8 @@ FSDP_AUTO_WRAP_POLICY = ["TRANSFORMER_BASED_WRAP", "SIZE_BASED_WRAP", "NO_WRAP"] FSDP_BACKWARD_PREFETCH = ["BACKWARD_PRE", "BACKWARD_POST", "NO_PREFETCH"] FSDP_STATE_DICT_TYPE = ["FULL_STATE_DICT", "LOCAL_STATE_DICT", "SHARDED_STATE_DICT"] -FSDP_PYTORCH_VERSION = "2.0.1" +FSDP_PYTORCH_VERSION = "2.1.0" +FSDP_MODEL_NAME = "pytorch_model_fsdp" DEEPSPEED_MULTINODE_LAUNCHERS = ["pdsh", "standard", "openmpi", "mvapich", "mpich"] TORCH_DYNAMO_MODES = ["default", "reduce-overhead", "max-autotune"] diff --git a/src/accelerate/utils/dataclasses.py b/src/accelerate/utils/dataclasses.py index 2d22d460aee..6bc51c399e3 100644 --- a/src/accelerate/utils/dataclasses.py +++ b/src/accelerate/utils/dataclasses.py @@ -870,7 +870,7 @@ class FullyShardedDataParallelPlugin: }, ) use_orig_params: bool = field( - default=False, + default=True, metadata={ "help": "If True, allows non-uniform `requires_grad` during init, which means support for interspersed frozen and trainable paramteres. " "Useful in cases such as parameter-efficient fine-tuning. " diff --git a/src/accelerate/utils/fsdp_utils.py b/src/accelerate/utils/fsdp_utils.py index 827b9ffd99c..edff9dec604 100644 --- a/src/accelerate/utils/fsdp_utils.py +++ b/src/accelerate/utils/fsdp_utils.py @@ -16,7 +16,7 @@ import torch from ..logging import get_logger -from .constants import FSDP_PYTORCH_VERSION, MODEL_NAME, OPTIMIZER_NAME +from .constants import FSDP_MODEL_NAME, FSDP_PYTORCH_VERSION, OPTIMIZER_NAME from .imports import is_torch_distributed_available from .versions import is_torch_version @@ -47,7 +47,7 @@ def save_fsdp_model(fsdp_plugin, accelerator, model, output_dir, model_index=0): ): state_dict = model.state_dict() if fsdp_plugin.state_dict_type == StateDictType.FULL_STATE_DICT: - weights_name = f"{MODEL_NAME}.bin" if model_index == 0 else f"{MODEL_NAME}_{model_index}.bin" + weights_name = f"{FSDP_MODEL_NAME}.bin" if model_index == 0 else f"{FSDP_MODEL_NAME}_{model_index}.bin" output_model_file = os.path.join(output_dir, weights_name) if accelerator.process_index == 0: logger.info(f"Saving model to {output_model_file}") @@ -55,16 +55,16 @@ def save_fsdp_model(fsdp_plugin, accelerator, model, output_dir, model_index=0): logger.info(f"Model saved to {output_model_file}") elif fsdp_plugin.state_dict_type == StateDictType.LOCAL_STATE_DICT: weights_name = ( - f"{MODEL_NAME}_rank{accelerator.process_index}.bin" + f"{FSDP_MODEL_NAME}_rank{accelerator.process_index}.bin" if model_index == 0 - else f"{MODEL_NAME}_{model_index}_rank{accelerator.process_index}.bin" + else f"{FSDP_MODEL_NAME}_{model_index}_rank{accelerator.process_index}.bin" ) output_model_file = os.path.join(output_dir, weights_name) logger.info(f"Saving model to {output_model_file}") torch.save(state_dict, output_model_file) logger.info(f"Model saved to {output_model_file}") elif fsdp_plugin.state_dict_type == StateDictType.SHARDED_STATE_DICT: - ckpt_dir = os.path.join(output_dir, f"{MODEL_NAME}_{model_index}") + ckpt_dir = os.path.join(output_dir, f"{FSDP_MODEL_NAME}_{model_index}") os.makedirs(ckpt_dir, exist_ok=True) logger.info(f"Saving model to {ckpt_dir}") state_dict = {"model": state_dict} @@ -96,16 +96,16 @@ def load_fsdp_model(fsdp_plugin, accelerator, model, input_dir, model_index=0): "initializing FSDP object" ) 
return - weights_name = f"{MODEL_NAME}.bin" if model_index == 0 else f"{MODEL_NAME}_{model_index}.bin" + weights_name = f"{FSDP_MODEL_NAME}.bin" if model_index == 0 else f"{FSDP_MODEL_NAME}_{model_index}.bin" input_model_file = os.path.join(input_dir, weights_name) logger.info(f"Loading model from {input_model_file}") state_dict = torch.load(input_model_file) logger.info(f"Model loaded from {input_model_file}") elif fsdp_plugin.state_dict_type == StateDictType.LOCAL_STATE_DICT: weights_name = ( - f"{MODEL_NAME}_rank{accelerator.process_index}.bin" + f"{FSDP_MODEL_NAME}_rank{accelerator.process_index}.bin" if model_index == 0 - else f"{MODEL_NAME}_{model_index}_rank{accelerator.process_index}.bin" + else f"{FSDP_MODEL_NAME}_{model_index}_rank{accelerator.process_index}.bin" ) input_model_file = os.path.join(input_dir, weights_name) logger.info(f"Loading model from {input_model_file}") @@ -113,8 +113,8 @@ def load_fsdp_model(fsdp_plugin, accelerator, model, input_dir, model_index=0): logger.info(f"Model loaded from {input_model_file}") elif fsdp_plugin.state_dict_type == StateDictType.SHARDED_STATE_DICT: ckpt_dir = ( - os.path.join(input_dir, f"{MODEL_NAME}_{model_index}") - if f"{MODEL_NAME}" not in input_dir + os.path.join(input_dir, f"{FSDP_MODEL_NAME}_{model_index}") + if f"{FSDP_MODEL_NAME}" not in input_dir else input_dir ) logger.info(f"Loading model from {ckpt_dir}") @@ -164,16 +164,14 @@ def load_fsdp_optimizer(fsdp_plugin, accelerator, optimizer, model, input_dir, o ): if fsdp_plugin.state_dict_type == StateDictType.FULL_STATE_DICT: optim_state = None - # below check should work but currently it isn't working (mostly opytorch issue), - # in the meantime disabling it at the cost of excess memory usage - # if accelerator.process_index == 0 or not fsdp_plugin.optim_state_dict_config.rank0_only: - optimizer_name = ( - f"{OPTIMIZER_NAME}.bin" if optimizer_index == 0 else f"{OPTIMIZER_NAME}_{optimizer_index}.bin" - ) - input_optimizer_file = os.path.join(input_dir, optimizer_name) - logger.info(f"Loading Optimizer state from {input_optimizer_file}") - optim_state = torch.load(input_optimizer_file) - logger.info(f"Optimizer state loaded from {input_optimizer_file}") + if accelerator.process_index == 0 or not fsdp_plugin.optim_state_dict_config.rank0_only: + optimizer_name = ( + f"{OPTIMIZER_NAME}.bin" if optimizer_index == 0 else f"{OPTIMIZER_NAME}_{optimizer_index}.bin" + ) + input_optimizer_file = os.path.join(input_dir, optimizer_name) + logger.info(f"Loading Optimizer state from {input_optimizer_file}") + optim_state = torch.load(input_optimizer_file) + logger.info(f"Optimizer state loaded from {input_optimizer_file}") else: ckpt_dir = ( os.path.join(input_dir, f"{OPTIMIZER_NAME}_{optimizer_index}") diff --git a/tests/fsdp/test_fsdp.py b/tests/fsdp/test_fsdp.py index 7b87f61f471..244bedf4d82 100644 --- a/tests/fsdp/test_fsdp.py +++ b/tests/fsdp/test_fsdp.py @@ -252,6 +252,11 @@ def test_checkpointing(self): continue state_dict_config_index = len(cmd_config) for state_dict_type in FSDP_STATE_DICT_TYPE: + # Todo: Currently failing for `LOCAL_STATE_DICT` with error + # Unexpected key(s) in state_dict: "_fsdp_wrapped_module._flat_param". 
+ if state_dict_type == "LOCAL_STATE_DICT": + continue + cmd_config = cmd_config[:state_dict_config_index] cmd_config.append(f"--fsdp_state_dict_type={state_dict_type}") cmd_config.extend( From 5fc1b230d339c6e77179adfe2b74a6b414c9cbbf Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Tue, 28 Nov 2023 13:34:11 -0500 Subject: [PATCH 23/24] Pin DVC (#2196) * Remove dvc * Pin instead --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b3a8fda47bf..f7369d7df97 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ extras["testing"] = extras["test_prod"] + extras["test_dev"] extras["rich"] = ["rich"] -extras["test_trackers"] = ["wandb", "comet-ml", "tensorboard", "dvclive"] +extras["test_trackers"] = ["wandb", "comet-ml", "tensorboard", "dvclive", "dvc<=3.30.1"] extras["dev"] = extras["quality"] + extras["testing"] + extras["rich"] extras["sagemaker"] = [ From b04d36c75f701266048382426b4074e28bfdb67c Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Tue, 28 Nov 2023 15:02:20 -0500 Subject: [PATCH 24/24] Apply DVC warning to Accelerate (#2197) * Use logger warn instead * Warn * Right import * Clean up logs * Apply suggestions from code review Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> --------- Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> --- src/accelerate/logging.py | 12 ++++++++++++ src/accelerate/tracking.py | 20 +++++++++++++++----- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/src/accelerate/logging.py b/src/accelerate/logging.py index d553b9a993c..ebb8c1eb830 100644 --- a/src/accelerate/logging.py +++ b/src/accelerate/logging.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import functools import logging import os @@ -67,6 +68,17 @@ def log(self, level, msg, *args, **kwargs): self.logger.log(level, msg, *args, **kwargs) state.wait_for_everyone() + @functools.lru_cache(None) + def warning_once(self, *args, **kwargs): + """ + This method is identical to `logger.warning()`, but will emit the warning with the same message only once + + Note: The cache is for the function arguments, so 2 different callers using the same arguments will hit the + cache. The assumption here is that all warning messages are unique across the code. If they aren't then need to + switch to another type of cache that includes the caller frame information in the hashing function. + """ + self.warning(*args, **kwargs) + def get_logger(name: str, log_level: str = None): """ diff --git a/src/accelerate/tracking.py b/src/accelerate/tracking.py index 711f616b73e..7276f552aaf 100644 --- a/src/accelerate/tracking.py +++ b/src/accelerate/tracking.py @@ -640,8 +640,8 @@ def store_init_configuration(self, values: dict): for name, value in list(values.items()): # internally, all values are converted to str in MLflow if len(str(value)) > mlflow.utils.validation.MAX_PARAM_VAL_LENGTH: - logger.warning( - f'Trainer is attempting to log a value of "{value}" for key "{name}" as a parameter. MLflow\'s' + logger.warning_once( + f'Accelerate is attempting to log a value of "{value}" for key "{name}" as a parameter. MLflow\'s' f" log_param() only accepts values no longer than {mlflow.utils.validation.MAX_PARAM_VAL_LENGTH} characters so we dropped this attribute." 
) del values[name] @@ -670,7 +670,7 @@ def log(self, values: dict, step: Optional[int]): if isinstance(v, (int, float)): metrics[k] = v else: - logger.warning( + logger.warning_once( f'MLflowTracker is attempting to log a value of "{v}" of type {type(v)} for key "{k}" as a metric. ' "MLflow's log_metric() only accepts float and int types so we dropped this attribute." ) @@ -755,7 +755,7 @@ def log(self, values: Dict[str, Union[int, float]], step: Optional[int] = None, clearml_logger = self.task.get_logger() for k, v in values.items(): if not isinstance(v, (int, float)): - logger.warning( + logger.warning_once( "Accelerator is attempting to log a value of " f'"{v}" of type {type(v)} for key "{k}" as a scalar. ' "This invocation of ClearML logger's report_scalar() " @@ -901,10 +901,20 @@ def log(self, values: dict, step: Optional[int] = None, **kwargs): kwargs: Additional key word arguments passed along to `dvclive.Live.log_metric()`. """ + from dvclive.plots import Metric + if step is not None: self.live.step = step for k, v in values.items(): - self.live.log_metric(k, v, **kwargs) + if Metric.could_log(v): + self.live.log_metric(k, v, **kwargs) + else: + logger.warning_once( + "Accelerator attempted to log a value of " + f'"{v}" of type {type(v)} for key "{k}" as a scalar. ' + "This invocation of DVCLive's Live.log_metric() " + "is incorrect so we dropped this attribute." + ) @on_main_process def finish(self):