From c47e012fd3ec79eed50b5842a56acd67ac8ef220 Mon Sep 17 00:00:00 2001 From: Brian <23239305+b-chu@users.noreply.github.com> Date: Wed, 14 Feb 2024 11:47:05 -0800 Subject: [PATCH 01/28] Use testing repo actions (#2948) --- .github/workflows/code-quality.yaml | 57 +++++++++++++++++--------- .github/workflows/codeql-analysis.yml | 47 +++++++-------------- .github/workflows/coverage.yaml | 37 +++++++++-------- .github/workflows/daily.yaml | 3 ++ .github/workflows/pr-code-quality.yaml | 28 ------------- .github/workflows/pr-cpu.yaml | 3 ++ .github/workflows/smoketest.yaml | 29 +++++++------ 7 files changed, 96 insertions(+), 108 deletions(-) delete mode 100644 .github/workflows/pr-code-quality.yaml diff --git a/.github/workflows/code-quality.yaml b/.github/workflows/code-quality.yaml index 20bbf327b7..e3400b81b4 100644 --- a/.github/workflows/code-quality.yaml +++ b/.github/workflows/code-quality.yaml @@ -1,13 +1,17 @@ name: Code Quality Checks on: - workflow_call: - inputs: - python_version: - required: true - type: string - pip_deps: - required: true - type: string + push: + branches: + - dev + - main + - release/** + pull_request: + workflow_dispatch: +# Cancel old runs when a new commit is pushed to the same branch if not on main +# or dev +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }} defaults: run: working-directory: . @@ -15,16 +19,31 @@ jobs: code-quality: runs-on: ubuntu-20.04 timeout-minutes: 15 + strategy: + matrix: + python_version: + - "3.9" + - "3.10" + - "3.11" + pip_deps: + - "[dev]" steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 + - name: Checkout + uses: actions/checkout@v3 + - name: Get Repo Token + id: REPO_TOKEN + uses: tibdex/github-app-token@v1 with: - python-version: ${{ inputs.python_version }} - - name: Setup - run: | - set -ex - python -m pip install --upgrade 'pip<23' wheel - python -m pip install --upgrade .${{ inputs.pip_deps }} - - name: Run checks - run: | - pre-commit run --all-files + app_id: ${{ secrets.DEVOGE_BOT_PUB_APPID }} + private_key: ${{ secrets.DEVOGE_BOT_PUB_PEM }} + - name: Get composite run steps repository + uses: actions/checkout@v3 + with: + repository: mosaicml/testing + ref: v0.0.2 + path: ./testing + token: ${{ steps.REPO_TOKEN.outputs.token }} + - uses: ./testing/.github/actions/code-quality + with: + python_version: ${{ matrix.python_version }} + pip_deps: ${{ matrix.pip_deps }} diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index 151179d524..5a4ddb477c 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -41,36 +41,19 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v2 - - # Initializes the CodeQL tools for scanning. - - name: Initialize CodeQL - uses: github/codeql-action/init@v2 + - name: Get Repo Token + id: REPO_TOKEN + uses: tibdex/github-app-token@v1 with: - languages: ${{ matrix.language }} - # If you wish to specify custom queries, you can do so here or in a - # config file. - # By default, queries listed here will override any specified in a - # config file. - # Prefix the list here with "+" to use these queries and those in the - # config file. - # queries: ./path/to/local/query, your-org/your-repo/queries@main - - # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 
-      # If this step fails, then you should remove it and run the build manually
-      # (see below)
-      - name: Autobuild
-        uses: github/codeql-action/autobuild@v2
-
-      # ℹ️ Command-line programs to run using the OS shell.
-      # 📚 https://git.io/JvXDl
-
-      # ✏️ If the Autobuild fails above, remove it and uncomment the following
-      # three lines and modify them (or add more) to build your code if your
-      # project uses a compiled language
-
-      # - run: |
-      #   make bootstrap
-      #   make release
-
-      - name: Perform CodeQL Analysis
-        uses: github/codeql-action/analyze@v2
+          app_id: ${{ secrets.DEVOGE_BOT_PUB_APPID }}
+          private_key: ${{ secrets.DEVOGE_BOT_PUB_PEM }}
+      - name: Get composite run steps repository
+        uses: actions/checkout@v3
+        with:
+          repository: mosaicml/testing
+          ref: v0.0.2
+          path: ./testing
+          token: ${{ steps.REPO_TOKEN.outputs.token }}
+      - uses: ./testing/.github/actions/codeql-analysis
+        with:
+          language: ${{ matrix.language }}
diff --git a/.github/workflows/coverage.yaml b/.github/workflows/coverage.yaml
index f89d67ec39..fe69f936ab 100644
--- a/.github/workflows/coverage.yaml
+++ b/.github/workflows/coverage.yaml
@@ -5,6 +5,11 @@ on:
       download-path:
         required: true
         type: string
+    secrets:
+      app_id:
+        required: true
+      private_key:
+        required: true
 jobs:
   coverage:
     timeout-minutes: 5
@@ -12,21 +17,19 @@ jobs:
     steps:
       - name: Checkout Repo
         uses: actions/checkout@v3
-      - name: Setup
-        run: |
-          set -ex
-          python -m pip install --upgrade 'pip<23' wheel
-          pip install coverage[toml]==6.5.0
-      - name: Download artifacts
-        uses: actions/download-artifact@v3
+      - name: Get Repo Token
+        id: REPO_TOKEN
+        uses: tibdex/github-app-token@v1
+        with:
+          app_id: ${{ secrets.app_id }}
+          private_key: ${{ secrets.private_key }}
+      - name: Get composite run steps repository
+        uses: actions/checkout@v3
+        with:
+          repository: mosaicml/testing
+          ref: v0.0.2
+          path: ./testing
+          token: ${{ steps.REPO_TOKEN.outputs.token }}
+      - uses: ./testing/.github/actions/coverage
         with:
-          path: ${{ inputs.download-path }}
-      - name: Generate coverage report
-        run: |
-          set -ex
-
-          # Flatten the coverage files
-          ls ${{ inputs.download-path }} | while read x; do mv ${{ inputs.download-path }}/$x/.coverage .coverage.$x; done
-
-          python -m coverage combine
-          python -m coverage report
+          download-path: ${{ inputs.download-path }}
diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml
index 3c65b0f4fa..3867a75b71 100644
--- a/.github/workflows/daily.yaml
+++ b/.github/workflows/daily.yaml
@@ -97,6 +97,9 @@ jobs:
     needs: [daily-pytest-cpu]
     with:
       download-path: artifacts
+    secrets:
+      app_id: ${{ secrets.DEVOGE_BOT_PUB_APPID }}
+      private_key: ${{ secrets.DEVOGE_BOT_PUB_PEM }}

   daily-pytest-gpu:
     uses: ./.github/workflows/pytest-gpu.yaml
diff --git a/.github/workflows/pr-code-quality.yaml b/.github/workflows/pr-code-quality.yaml
deleted file mode 100644
index 26d2546e75..0000000000
--- a/.github/workflows/pr-code-quality.yaml
+++ /dev/null
@@ -1,28 +0,0 @@
-name: PR Code Quality Checks
-on:
-  push:
-    branches:
-      - dev
-      - main
-      - release/**
-  pull_request:
-  workflow_dispatch:
-# Cancel old runs when a new commit is pushed to the same branch if not on main
-# or dev
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
-  cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }}
-jobs:
-  code-quality:
-    uses: ./.github/workflows/code-quality.yaml
-    strategy:
-      matrix:
-        python_version:
-          - "3.9"
-          - "3.10"
-          - "3.11"
-        pip_deps:
-          - "[dev]"
-    with:
-      
python_version: ${{ matrix.python_version }} - pip_deps: ${{ matrix.pip_deps }} diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index 6eee54cb0b..c17dd32f0a 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -43,3 +43,6 @@ jobs: needs: [pytest-cpu] with: download-path: artifacts + secrets: + app_id: ${{ secrets.DEVOGE_BOT_PUB_APPID }} + private_key: ${{ secrets.DEVOGE_BOT_PUB_PEM }} diff --git a/.github/workflows/smoketest.yaml b/.github/workflows/smoketest.yaml index 429cc40b1d..7d30a7b0e5 100644 --- a/.github/workflows/smoketest.yaml +++ b/.github/workflows/smoketest.yaml @@ -27,16 +27,21 @@ jobs: - "3.10" - "3.11" steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 + - name: Checkout + uses: actions/checkout@v3 + - name: Get Repo Token + id: REPO_TOKEN + uses: tibdex/github-app-token@v1 with: - python-version: ${{ matrix.python_version }} - - name: Setup - run: | - set -ex - python -m pip install --upgrade 'pip<23' wheel - python -m pip install --upgrade . - python -m pip install pytest==7.2.1 pytest_codeblocks==0.16.1 - - name: Run checks - run: | - pytest tests/test_smoketest.py + app_id: ${{ secrets.DEVOGE_BOT_PUB_APPID }} + private_key: ${{ secrets.DEVOGE_BOT_PUB_PEM }} + - name: Get composite run steps repository + uses: actions/checkout@v3 + with: + repository: mosaicml/testing + ref: v0.0.2 + path: ./testing + token: ${{ steps.REPO_TOKEN.outputs.token }} + - uses: ./testing/.github/actions/smoketest + with: + python_version: ${{ matrix.python_version }} From 6a5972f8713611e2d36513e148cc3542b30b0873 Mon Sep 17 00:00:00 2001 From: Jose Javier <26491792+josejg@users.noreply.github.com> Date: Thu, 15 Feb 2024 08:27:34 -0800 Subject: [PATCH 02/28] Make CodeEval respect device_eval_batch_size (#2969) * Make CodeEval respect device_eval_batch_size * fix * fix * Avoid materializing dataset in memory * fix * fix * Helper func * fix * Remove todo * fix documentation * use pre-commit run * Tests * fix error msg * left padding * final touches * Fix gpu test * Tests * Return tensor * fix * pyright * fix pyright * Allow subset batches * pyright --------- Co-authored-by: Eitan Turok Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- .../in_context_learning_evaluation.py | 345 +++++++++++------- composer/metrics/nlp.py | 106 +++--- .../test_in_context_learning_datasets.py | 185 +++++----- tests/metrics/test_nlp_metrics.py | 27 +- 4 files changed, 394 insertions(+), 269 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 2fc75cf899..459487f158 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -47,7 +47,7 @@ def strip_data(example: Dict) -> Dict: return {k: v.strip() if isinstance(v, str) else v for k, v in example.items()} -def _tokenizer_needs_prefix_space(tokenizer: transformers.PreTrainedTokenizerBase) -> bool: +def _tokenizer_needs_prefix_space(tokenizer: transformers.PreTrainedTokenizerBase,) -> bool: """ Test for whether a prefix space is needed before the continuation. Sentencepiece tokenization should not have a prefix space, but gpt2 style BPE should. 
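# A minimal sketch of the check described above, assuming a heuristic implementation --
# the real function body lies outside this hunk, so the helper name and logic here are
# illustrative only: tokenize a leading-space string and test whether the space merges
# into the following token.
def _prefix_space_heuristic_sketch(tokenizer) -> bool:
    # One resulting token means the space was absorbed into the token (gpt2-style BPE),
    # so the continuation needs an explicit prefix space; two tokens suggest
    # sentencepiece-style handling, which needs none.
    return len(tokenizer(' a', add_special_tokens=False)['input_ids']) == 1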
@@ -82,7 +82,7 @@ def _trim_context(context_enc: List, continuation_enc: List, max_seq_len: int) - if context_max_subseq_len < 0: # can't support continuations which are longer than the max seq len - raise Exception(f'Dataset included continuation longer than the max seq len') + raise Exception(f'Dataset included continuation longer than the max seq len of {max_seq_len}') # clip from the end context_enc = context_enc[-(context_max_subseq_len):] @@ -103,11 +103,13 @@ def _get_continuation_span(context_enc: List, continuation_enc: List) -> torch.T return torch.tensor(range(len(context_enc), len(context_enc) + len(continuation_enc))) -def _make_padded_input(context_enc: List, - continuation_enc: List, - max_seq_len: int, - pad_tok_id: int, - padding_side: str = 'right') -> torch.Tensor: +def _make_padded_input( + context_enc: List, + continuation_enc: List, + max_seq_len: int, + pad_tok_id: int, + padding_side: str = 'right', +) -> torch.Tensor: """ Takes an encoded context and continuation and clips the beginning of the context if they're too long. Adds the padding token to the specified side. @@ -293,6 +295,7 @@ def __init__( ): try: import datasets + del datasets except ImportError as e: raise MissingConditionalImportError( @@ -364,11 +367,13 @@ def update_generation_kwargs(self, generation_kwargs: Dict) -> None: self.base_batch['generation_kwargs'] = {} self.base_batch['generation_kwargs'].update(generation_kwargs) - def read_dataset(self, - dataset_uri: str, - destination_path: str, - hf_loading_vars: Optional[Dict[str, Any]] = None, - hf_parsing_map: Optional[Dict[str, Any]] = None) -> 'HFDataset': + def read_dataset( + self, + dataset_uri: str, + destination_path: str, + hf_loading_vars: Optional[Dict[str, Any]] = None, + hf_parsing_map: Optional[Dict[str, Any]] = None, + ) -> 'HFDataset': """ Reads a dataset and handles parsing it from HuggingFace. 
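# A minimal sketch of the 'hf://' URI handling shown in the hunk below -- the dataset id
# is hypothetical, used only to illustrate how the scheme marker is stripped before the
# name is handed to datasets.load_dataset.
dataset_uri = 'hf://org/some-eval-set'  # hypothetical HuggingFace dataset id
if 'hf://' in dataset_uri:
    dataset_uri = dataset_uri.replace('hf://', '')
print(dataset_uri)  # -> org/some-eval-set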
@@ -384,6 +389,7 @@ def read_dataset(self, """ from datasets import Dataset as HFDataset # pyright: ignore[reportGeneralTypeIssues] from datasets import load_dataset # pyright: ignore[reportGeneralTypeIssues] + if 'hf://' in dataset_uri: dataset_uri = dataset_uri.replace('hf://', '') if hf_loading_vars is None: @@ -467,7 +473,7 @@ def construct_context(self, example: Dict, preceding_text: str = '', add_answer: ctxt = f'{self.example_delimiter}{ctxt}' ctxt = f'{ctxt}{self.continuation_delimiter}' if add_answer: - ctxt = f'{ctxt}{self.get_answer_from_example(example, in_context=add_answer)}' + ctxt = (f'{ctxt}{self.get_answer_from_example(example, in_context=add_answer)}') return ctxt def get_answer_from_example(self, example: Dict[str, Any], in_context: bool = False) -> str: @@ -538,8 +544,13 @@ def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> trimmed_context = _trim_context(tokenized_context, tokenized_answer, self.padding_size) assert isinstance(trimmed_context, list) continuation_indices = _get_continuation_span(trimmed_context, tokenized_answer) - padded_context = _make_padded_input(trimmed_context, tokenized_answer, self.padding_size, self.pad_tok_id, - self.padding_side) + padded_context = _make_padded_input( + trimmed_context, + tokenized_answer, + self.padding_size, + self.pad_tok_id, + self.padding_side, + ) tokenized_example[self.context_key] = padded_context tokenized_example[self.answer_key] = tokenized_answer @@ -552,8 +563,13 @@ def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> self.padding_size, ) assert isinstance(trimmed_context, list) - padded_context = _make_padded_input(trimmed_context, [], self.padding_size, self.pad_tok_id, - self.padding_side) + padded_context = _make_padded_input( + trimmed_context, + [], + self.padding_size, + self.pad_tok_id, + self.padding_side, + ) tokenized_example[self.context_key] = padded_context tokenized_example[self.answer_key] = self.get_answer_from_example(example) @@ -660,29 +676,38 @@ class InContextLearningQATaskDataset(InContextLearningDataset): cot_delimiter (str): Delimiter to place between the chain of thought and continuations. 
""" - def __init__(self, - cot_delimiter: str = '', - early_stopping_criteria: Optional[List[str]] = None, - do_normalization: bool = True, - *args, - **kwargs): + def __init__( + self, + cot_delimiter: str = '', + early_stopping_criteria: Optional[List[str]] = None, + do_normalization: bool = True, + *args, + **kwargs, + ): if kwargs['tokenizer'].eos_token_id is None: raise ValueError('`InContextLearningQATaskDataset` tokenizer must have non-null `eos_token_id`') self.cot_delimiter = cot_delimiter self.has_cot = False self.max_answer_length = 0 static_keys = [ - 'mode', 'cot_delimiter', 'generation_length', 'generation_kwargs', 'do_normalization', 'stopping_criteria' + 'mode', + 'cot_delimiter', + 'generation_length', + 'generation_kwargs', + 'do_normalization', + 'stopping_criteria', ] tensor_keys = ['input_ids', 'attention_mask'] list_keys = ['labels'] - super().__init__(padding_side='left', - tokenize_labels=False, - static_keys=static_keys, - list_keys=list_keys, - tensor_keys=tensor_keys, - *args, - **kwargs) + super().__init__( + padding_side='left', + tokenize_labels=False, + static_keys=static_keys, + list_keys=list_keys, + tensor_keys=tensor_keys, + *args, + **kwargs, + ) # NOTE: set these after init call because they take class vars self.early_stopping_criteria = early_stopping_criteria self.base_batch = { @@ -697,7 +722,7 @@ def __init__(self, 'pad_token_id': self.pad_tok_id, 'use_cache': True, 'eos_token_id': self.tokenizer.eos_token_id, - } + }, } self.batch_mapping = { 'input_ids': self.context_key, @@ -782,10 +807,12 @@ def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: batch_size = batch['input_ids'].shape[0] stopping_criteria = None if self.early_stopping_criteria: - if stop_sequences_criteria is None: # pyright: ignore [reportUnnecessaryComparison] - raise MissingConditionalImportError(extra_deps_group='nlp', - conda_package='transformers', - conda_channel='conda-forge') + if (stop_sequences_criteria is None): # pyright: ignore [reportUnnecessaryComparison] + raise MissingConditionalImportError( + extra_deps_group='nlp', + conda_package='transformers', + conda_channel='conda-forge', + ) stopping_criteria = stop_sequences_criteria(self.tokenizer, self.early_stopping_criteria, batch_size) batch['generation_kwargs']['stopping_criteria'] = stopping_criteria return batch @@ -804,22 +831,29 @@ class InContextLearningLMTaskDataset(InContextLearningDataset): """ def __init__(self, *args, **kwargs): - super().__init__(answer_key='continuation', - static_keys=['mode'], - tensor_keys=['input_ids', 'continuation_indices', 'labels', 'attention_mask'], - base_batch={ - 'input_ids': [], - 'continuation_indices': [], - 'mode': 'icl_task', - 'labels': [] - }, - batch_mapping={ - 'input_ids': 'context', - 'labels': 'context' - }, - padding_side='right', - *args, - **kwargs) + super().__init__( + answer_key='continuation', + static_keys=['mode'], + tensor_keys=[ + 'input_ids', + 'continuation_indices', + 'labels', + 'attention_mask', + ], + base_batch={ + 'input_ids': [], + 'continuation_indices': [], + 'mode': 'icl_task', + 'labels': [], + }, + batch_mapping={ + 'input_ids': 'context', + 'labels': 'context' + }, + padding_side='right', + *args, + **kwargs, + ) class InContextLearningMultipleChoiceTaskDataset(InContextLearningDataset): @@ -847,14 +881,16 @@ class InContextLearningMultipleChoiceTaskDataset(InContextLearningDataset): choices_key (str): The key under which the choices are stored in the saved dataset. Defaults to 'choices'. 
""" - def __init__(self, - choices_key: str = 'choices', - static_keys: Optional[List] = None, - list_of_tensors_keys: Optional[List] = None, - list_of_tuples_keys: Optional[List] = None, - list_of_primitives: Optional[List] = None, - *args, - **kwargs): + def __init__( + self, + choices_key: str = 'choices', + static_keys: Optional[List] = None, + list_of_tensors_keys: Optional[List] = None, + list_of_tuples_keys: Optional[List] = None, + list_of_primitives: Optional[List] = None, + *args, + **kwargs, + ): self.choices_key = choices_key base_batch = { 'input_ids': [], @@ -870,13 +906,15 @@ def __init__(self, self.list_of_tensors_keys = list_of_tensors_keys or ['continuation_indices'] self.list_of_tuples_keys = list_of_tuples_keys or ['choice_groupings'] self.list_of_primitives = list_of_primitives or ['gold_indices'] - super().__init__(context_key=context_key, - base_batch=base_batch, - static_keys=static_keys, - tensor_keys=tensor_keys, - padding_side='right', - *args, - **kwargs) + super().__init__( + context_key=context_key, + base_batch=base_batch, + static_keys=static_keys, + tensor_keys=tensor_keys, + padding_side='right', + *args, + **kwargs, + ) self.num_choices = len(self.dataset[0][self.choices_key]) self.batch_mapping_per_choice = {'input_ids': 'context', 'labels': 'context'} self.batch_map_per_example = {'gold_indices': 'gold'} @@ -1056,13 +1094,15 @@ def __init__(self, choices_key='context_options', *args, **kwargs): static_keys = ['mode'] tensor_keys = ['input_ids', 'labels', 'attention_mask'] list_of_tensors_keys = ['continuation_indices'] - super().__init__(choices_key=choices_key, - context_key=choices_key, - static_keys=static_keys, - tensor_keys=tensor_keys, - list_of_tensors_keys=list_of_tensors_keys, - *args, - **kwargs) + super().__init__( + choices_key=choices_key, + context_key=choices_key, + static_keys=static_keys, + tensor_keys=tensor_keys, + list_of_tensors_keys=list_of_tensors_keys, + *args, + **kwargs, + ) self.base_batch = { 'input_ids': [], 'continuation_indices': [], @@ -1160,8 +1200,8 @@ def tokenize_example(self, prompt_and_fewshot: str, context_options: List[str], assert isinstance(preamble, list) preamble = self._fix_eos_on_preamble(preamble) encoded_contexts = [ - preamble + # pyright: ignore[reportOperatorIssue, reportGeneralTypeIssues] - self.tokenizer(c, add_special_tokens=False)['input_ids'] # pyright: ignore[reportOperatorIssue, ] + preamble + self.tokenizer( # pyright: ignore[reportOperatorIssue, reportGeneralTypeIssues] + c, add_special_tokens=False)['input_ids'] # pyright: ignore[reportOperatorIssue, ] for c in context_options ] continuation = example['continuation'] @@ -1178,8 +1218,13 @@ def tokenize_example(self, prompt_and_fewshot: str, context_options: List[str], trimmed_context = _trim_context(context, tokenized_continuation, self.padding_size) assert isinstance(trimmed_context, list) continuation_indices = _get_continuation_span(trimmed_context, tokenized_continuation) - padded_context = _make_padded_input(trimmed_context, tokenized_continuation, self.padding_size, - self.pad_tok_id, self.padding_side) + padded_context = _make_padded_input( + trimmed_context, + tokenized_continuation, + self.padding_size, + self.pad_tok_id, + self.padding_side, + ) tokenized_example[self.context_key].append(padded_context) tokenized_example['continuation_indices'].append(continuation_indices) tokenized_example[self.answer_key].append(tokenized_continuation) @@ -1222,7 +1267,6 @@ class InContextLearningCodeEvalDataset(InContextLearningDataset): - 
pad_token_id: ID for padding token, derived automatically - num_beams: How many beams to search for generations, set to 1 - - num_return_sequences: Value passed for 'generations_per_sample', how many generations per prompt - do_sample: Determines whether model is sampling or greedily decoding. Always set to True - use_cache: Whether or not to use past key values to speed up sampling. Always set to True @@ -1234,11 +1278,13 @@ class InContextLearningCodeEvalDataset(InContextLearningDataset): def __init__( self, generations_per_sample: int, - pass_at_k: int = 1, + pass_at_k: Union[int, list[int]] = 1, *args, **kwargs, ): - if generations_per_sample < pass_at_k: + if isinstance(pass_at_k, int): + pass_at_k = [pass_at_k] + if generations_per_sample < max(pass_at_k): raise ValueError( f'generations_per_sample ({generations_per_sample}) must be greater than or equal to pass_at_k ({pass_at_k}) for code evaluation.' ) @@ -1250,13 +1296,30 @@ def __init__( 'entry_points': 'entry_point', 'test_inputs': 'test_inputs', 'test_outputs': 'test_outputs', - 'languages': 'language' + 'languages': 'language', + 'sample_id': 'sample_id', } # Linting complains if these are not set in init self.max_prompt_length = 0 self.max_answer_length = 0 - static_keys = ['mode', 'pass_at_k', 'generation_length', 'generation_kwargs'] - list_keys = ['prompts', 'tests', 'entry_points', 'test_inputs', 'test_outputs', 'languages', 'labels'] + static_keys = [ + 'mode', + 'pass_at_k', + 'generation_length', + 'generation_kwargs', + 'generations_per_sample', + 'dataset_size', + ] + list_keys = [ + 'prompts', + 'tests', + 'entry_points', + 'test_inputs', + 'test_outputs', + 'languages', + 'labels', + 'sample_id', + ] tensor_keys = ['input_ids', 'attention_mask'] super().__init__( context_key='prompt', @@ -1272,7 +1335,9 @@ def __init__( **kwargs, ) self._set_max_prompt_and_answer_lengths() + dataset_size = len(self.dataset) self.dataset = self.dataset.map(self._trim_padding) + self.dataset = self.repeat_dataset(self.dataset, generations_per_sample) self.base_batch = { 'input_ids': [], 'mode': 'generate', @@ -1288,15 +1353,33 @@ def __init__( 'generation_kwargs': { 'pad_token_id': self.pad_tok_id, 'num_beams': 1, # single beam - 'num_return_sequences': generations_per_sample, 'do_sample': True, + 'temperature': 0.2, # good default for code 'use_cache': True, - 'eos_token_id': self.tokenizer.eos_token_id - } + 'eos_token_id': self.tokenizer.eos_token_id, + }, + 'sample_id': [], + 'pass_at_k': list(pass_at_k), + 'generations_per_sample': generations_per_sample, + 'dataset_size': dataset_size, } if 'generation_kwargs' in kwargs: self.update_generation_kwargs(kwargs['generation_kwargs']) + def repeat_dataset(self, dataset: HFDataset, repetitions: int) -> HFDataset: + + def _repeat_dataset(): + for i, sample in enumerate(dataset): + for _ in range(repetitions): + assert isinstance(sample, dict) + yield {'sample_id': i, **sample} + + from datasets import Dataset as HFDataset # pyright: ignore[reportGeneralTypeIssues] + + repeated_dataset = HFDataset.from_generator(_repeat_dataset) + assert isinstance(repeated_dataset, HFDataset) + return repeated_dataset + def _set_max_prompt_and_answer_lengths(self): """ Iterates through the dataset and finds the maximum prompt length and sequence lengths @@ -1355,27 +1438,28 @@ def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> def build_icl_dataloader( - icl_task_type: str, - dataset_uri: str, - tokenizer: transformers.PreTrainedTokenizerBase, - batch_size: int, - 
max_seq_len: int, - pad_tok_id: int, - num_fewshot: int, - prompt_string: str, # e.g. 'translate english to french:' - example_delimiter: str, # e.g. '\n' - continuation_delimiter: str, # e.g. '' - hf_loading_vars: Dict, - hf_parsing_map: Dict, - destination_path: str, - prelimiter: str, # e.g. 'Question: ' - cot_delimiter: str, # e.g. ' ### ' - fewshot_random_seed: int, - pass_at_k: int, - generations_per_sample: int, - generation_kwargs: Dict, - early_stopping_criteria: Optional[List[str]] = None, - do_normalization: bool = True) -> DataSpec: + icl_task_type: str, + dataset_uri: str, + tokenizer: transformers.PreTrainedTokenizerBase, + batch_size: int, + max_seq_len: int, + pad_tok_id: int, + num_fewshot: int, + prompt_string: str, # e.g. 'translate english to french:' + example_delimiter: str, # e.g. '\n' + continuation_delimiter: str, # e.g. '' + hf_loading_vars: Dict, + hf_parsing_map: Dict, + destination_path: str, + prelimiter: str, # e.g. 'Question: ' + cot_delimiter: str, # e.g. ' ### ' + fewshot_random_seed: int, + pass_at_k: Union[int, list[int]], + generations_per_sample: int, + generation_kwargs: Dict, + early_stopping_criteria: Optional[List[str]] = None, + do_normalization: bool = True, +) -> DataSpec: """ Factory method that builds the specific dataset for the specified icl_task_type. See documentation for `get_icl_task_dataloader` for arugment documentation. @@ -1574,28 +1658,29 @@ def partition_dataset_by_category(dataset_uri: str, destination_path: str, hf_lo def get_icl_task_dataloader( - icl_task_type: str, - dataset_uri: str, - tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], - batch_size: int, - max_seq_len: int, - pad_tok_id: int, - num_fewshot: int, - prompt_string: str, # e.g. 'translate english to french:' - example_delimiter: str, # e.g. '\n' - continuation_delimiter: str = '', - destination_path: str = '', - question_prelimiter: str = '', # e.g. 'Question: ' - fewshot_random_seed: int = 1234, - pass_at_k: int = 1, - generations_per_sample: int = 1, - cot_delimiter: str = '', - has_categories: bool = False, - hf_loading_vars: Optional[Dict] = None, - hf_parsing_map: Optional[Dict] = None, - generation_kwargs: Optional[Dict] = None, - early_stopping_criteria: Optional[List[str]] = None, - do_normalization: bool = True) -> Union[DataSpec, Dict[str, DataSpec]]: + icl_task_type: str, + dataset_uri: str, + tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], + batch_size: int, + max_seq_len: int, + pad_tok_id: int, + num_fewshot: int, + prompt_string: str, # e.g. 'translate english to french:' + example_delimiter: str, # e.g. '\n' + continuation_delimiter: str = '', + destination_path: str = '', + question_prelimiter: str = '', # e.g. 'Question: ' + fewshot_random_seed: int = 1234, + pass_at_k: int = 1, + generations_per_sample: int = 1, + cot_delimiter: str = '', + has_categories: bool = False, + hf_loading_vars: Optional[Dict] = None, + hf_parsing_map: Optional[Dict] = None, + generation_kwargs: Optional[Dict] = None, + early_stopping_criteria: Optional[List[str]] = None, + do_normalization: bool = True, +) -> Union[DataSpec, Dict[str, DataSpec]]: """This constructs a dataloader (or dataloaders if has_categories is True) capable of evaluating LLMs on in-context learning language modeling tasks, for example LAMBADA. An example usage is below: .. 
testsetup:: diff --git a/composer/metrics/nlp.py b/composer/metrics/nlp.py index b4815ea702..9f53d095b7 100644 --- a/composer/metrics/nlp.py +++ b/composer/metrics/nlp.py @@ -16,6 +16,7 @@ from torch.nn import functional as F from torchmetrics import Metric +from composer.utils import dist from composer.utils.eval_client import EvalClient, LambdaEvalClient, LocalEvalClient, MosaicMLLambdaEvalClient log = logging.getLogger(__name__) @@ -601,8 +602,8 @@ class InContextLearningCodeEvalAccuracy(InContextLearningMetric): def __init__(self, dist_sync_on_step: bool = False): # state from multiple processes super().__init__(dist_sync_on_step=dist_sync_on_step) - self.add_state('correct', default=torch.tensor(0.), dist_reduce_fx='sum') - self.add_state('total', default=torch.tensor(0.), dist_reduce_fx='sum') + + self._initialized = False self.eval_device = os.environ.get('CODE_EVAL_DEVICE', None) if self.eval_device is not None: @@ -646,6 +647,18 @@ def estimator(self, n: int, c: int, k: int) -> float: return 1.0 return 1.0 - float(np.prod(1.0 - k / np.arange(n - c + 1, n + 1))) + def _initialize_state(self, batch: dict[str, Any]): + device = batch['input_ids'].device + self.dataset_size = batch['dataset_size'] + self.pass_at_k = batch['pass_at_k'] + self.num_generations = batch['generations_per_sample'] + + # We need to defer the accumulator initialization because it depends on dataset size + self.add_state('correct', default=torch.zeros(self.dataset_size, device=device), dist_reduce_fx='sum') + self.add_state('total', default=torch.zeros(self.dataset_size, device=device), dist_reduce_fx='sum') + dist.barrier() + self._initialized = True + def update(self, batch: Dict[str, Any], outputs: List[str], labels: List[str]): """Updates the pass@k accuracy of code generation. @@ -670,51 +683,62 @@ def update(self, batch: Dict[str, Any], outputs: List[str], labels: List[str]): labels (List[str]): A list of the correct code generations, for compatibility with existing HF generate functionalities. This is not used. 
""" + if not self._initialized: + self._initialize_state(batch) + del labels # never used client = self.get_client() - pass_at_k = batch['pass_at_k'] - num_generations = batch['generation_kwargs']['num_return_sequences'] - processed_outputs = [ - outputs[i * num_generations:(i + 1) * num_generations] for i in range(len(batch['prompts'])) - ] - payloads = [] - for sample_outputs, sample_prompt, test_inputs, test_outputs, entry_point, language in zip( - processed_outputs, batch['prompts'], batch['test_inputs'], batch['test_outputs'], batch['entry_points'], - batch['languages']): - self.total += torch.tensor(1.0) - prompt_payload = [] - for code_gen in sample_outputs: - code_gen = re.split(r'\n[A-Za-z0-9#`]', code_gen)[0] # remove everything after function ends - final_code = sample_prompt + code_gen # combine prompt with the code generation - generation_payload = [] - for test_input, test_output in zip(test_inputs, test_outputs): - payload = { - 'code': final_code, - 'input': test_input, - 'output': test_output, - 'entry_point': entry_point, - 'language': language, - } - generation_payload.append(payload) - - prompt_payload.append(generation_payload) - payloads.append(prompt_payload) - - results = client.invoke(payloads) - for prompt in results: - num_correct = 0 - for generation in prompt: - correct = all(generation) - if correct: - num_correct += 1 - - pass_at_k_rate = self.estimator(num_generations, num_correct, pass_at_k) - self.correct += torch.tensor(pass_at_k_rate) + for sample_id, code_gen, sample_prompt, test_inputs, test_outputs, entry_point, language in zip( + batch['sample_id'], outputs, batch['prompts'], batch['test_inputs'], batch['test_outputs'], + batch['entry_points'], batch['languages']): + + idx = sample_id + self.total[idx] += 1.0 + + code_gen = re.split(r'\n[A-Za-z0-9#`]', code_gen)[0] # remove everything after function ends + final_code = sample_prompt + code_gen # combine prompt with the code generation + + test_results = [] + for test_input, test_output in zip(test_inputs, test_outputs): + payload = { + 'code': final_code, + 'input': test_input, + 'output': test_output, + 'entry_point': entry_point, + 'language': language, + } + + result = client.invoke([[[payload]]])[0][0][0] + test_results.append(result) + + if all(test_results): + self.correct[idx] += 1.0 client.close() # pyright: ignore [reportOptionalMemberAccess] def compute(self): assert isinstance(self.correct, Tensor) assert isinstance(self.total, Tensor) - return self.correct / self.total + complete = self.total == self.num_generations # so that eval subset batches can be used + + if complete.sum() < (self.total != 0).sum(): + warnings.warn('Some samples in the dataset have less than the expected number of generations. ' + 'This is expected if you are using a subset of the dataset for evaluation.') + + if (self.correct > self.total).any().item(): + raise ValueError( + 'Internal error some samples have more correct than total generations. 
This should not happen.') + + results = {} + n = self.num_generations + + for k in self.pass_at_k: + pass_at_k = sum([self.estimator(n, int(c.item()), k) for c in self.correct[complete] + ]) / complete.sum().item() + results[f'pass@{k}'] = torch.tensor(pass_at_k) + + if len(results) == 1: # backwards compatibility + return list(results.values())[0] + + return results diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index 063f7215bc..3611e20dd1 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -1459,7 +1459,7 @@ def test_code_eval_split_batch(dataset_uri, tmp_path): 'code_evaluation', dataset_uri=dataset_uri, tokenizer=tokenizer, - batch_size=8, + batch_size=5, max_seq_len=1024, pad_tok_id=tokenizer.eos_token_id, num_fewshot=2, @@ -1467,28 +1467,16 @@ def test_code_eval_split_batch(dataset_uri, tmp_path): example_delimiter='\n', continuation_delimiter='', destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), - generations_per_sample=4, + generations_per_sample=3, ) assert isinstance(dl, DataSpec) # pyright + batches = list(dl.dataloader) - batch = next(iter(dl.dataloader)) - split_batch = dl.split_batch(batch, 3) + for k in ('input_ids', 'attention_mask'): + assert [b[k].shape[0] for b in batches] == [5, 5, 2] - assert len(split_batch) == 2 - split1 = split_batch[0] - split2 = split_batch[1] - - assert split1['input_ids'].shape[0] == 3 - assert split2['input_ids'].shape[0] == 1 - - assert split1['attention_mask'].shape[0] == 3 - assert split2['attention_mask'].shape[0] == 1 - - assert isinstance(split1['mode'], str) - assert isinstance(split2['mode'], str) - - list_split = { + list_keys = { 'labels': str, 'prompts': str, 'tests': str, @@ -1497,19 +1485,16 @@ def test_code_eval_split_batch(dataset_uri, tmp_path): 'test_outputs': list, 'languages': str, } - for k, v in list_split.items(): - assert len(split1[k]) == 3 - assert len(split2[k]) == 1 - assert all(isinstance(val, v) for val in split1[k] + split2[k]) - assert isinstance(split1['pass_at_k'], int) - assert isinstance(split2['pass_at_k'], int) + for batch, size in zip(batches, [5, 5, 2]): + for field, type_ in list_keys.items(): + assert len(batch[field]) == size + assert all(isinstance(val, type_) for val in batch[field]) - assert isinstance(split1['generation_length'], int) - assert isinstance(split2['generation_length'], int) - - assert isinstance(split1['generation_kwargs'], dict) - assert isinstance(split2['generation_kwargs'], dict) + static_keys = {'pass_at_k': (int, list), 'generation_length': int, 'generation_kwargs': dict} + for batch in batches: + for field, type_ in static_keys.items(): + assert isinstance(batch[field], type_) @pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl']) @@ -1524,7 +1509,7 @@ def test_code_eval_sentpiece_dataloader(dataset_uri, tmp_path, num_fewshot, prom tokenizer = tiny_llama_tokenizer dataset_uri = f'{local_data}/{dataset_uri}' - batch_size = 4 + batch_size = 5 seqlen = 2048 dl = get_icl_task_dataloader('code_evaluation', @@ -1543,43 +1528,53 @@ def test_code_eval_sentpiece_dataloader(dataset_uri, tmp_path, num_fewshot, prom assert isinstance(dl, DataSpec) assert isinstance(dl.dataloader, DataLoader) # pyright - batch = next(dl.dataloader._get_iterator()) + batches = list(dl.dataloader) + dataset_size = len(open(dataset_uri, 'r').read().strip().split('\n')) + dataset_size *= generations_per_sample max_prompt_length = 0 - if 
isinstance(dl.dataloader.dataset, InContextLearningCodeEvalDataset): - max_prompt_length = dl.dataloader.dataset.max_prompt_length - assert tuple(batch['input_ids'].shape) == (batch_size, max_prompt_length) - assert tuple(batch['attention_mask'].shape) == (batch_size, max_prompt_length) - assert batch['mode'] == 'generate' - # the maximum generation length from the small test data - assert batch['generation_length'] == 129 - assert any(item[0] != tokenizer.eos_token_id for item in batch['input_ids']) # longest should be pushed left - - decoded_batch = tokenizer.batch_decode(batch['input_ids']) - assert all(item.count('Code start: \n') == num_fewshot + 1 for item in decoded_batch) - if len(prompt_string) > 0: - assert all(item.count('Please code:\n') == 1 for item in decoded_batch) - - assert batch['labels'] == [ + has_left_padding = [] + for i, batch in enumerate(batches): + if isinstance(dl.dataloader.dataset, InContextLearningCodeEvalDataset): + max_prompt_length = dl.dataloader.dataset.max_prompt_length + N = len(batches) + bs = batch_size if i < N - 1 else dataset_size - (N - 1) * batch_size + assert tuple(batch['input_ids'].shape) == (bs, max_prompt_length) + assert tuple(batch['attention_mask'].shape) == (bs, max_prompt_length) + assert batch['mode'] == 'generate' + # the maximum generation length from the small test data + assert batch['generation_length'] == 129 + has_left_padding.extend([item[0] == tokenizer.eos_token_id for item in batch['input_ids']]) + assert not all(has_left_padding) # longest should be pushed left + + decoded_batches = [tokenizer.batch_decode(batch['input_ids']) for batch in batches] + for decoded_batch in decoded_batches: + assert all(item.count('Code start: \n') == num_fewshot + 1 for item in decoded_batch) + + if len(prompt_string) > 0: + assert all(item.count('Please code:\n') == 1 for item in decoded_batch) + + labels = [ ' for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n', " result = []\n current_string = []\n current_depth = 0\n\n for c in paren_string:\n if c == '(':\n current_depth += 1\n current_string.append(c)\n elif c == ')':\n current_depth -= 1\n current_string.append(c)\n\n if current_depth == 0:\n result.append(''.join(current_string))\n current_string.clear()\n\n return result\n", ' return number % 1.0\n', ' balance = 0\n\n for op in operations:\n balance += op\n if balance < 0:\n return True\n\n return False\n', ] - assert decoded_batch[0].endswith( - "Code start: \nfrom typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n" - ) - assert decoded_batch[1].endswith( - "Code start: \nfrom typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n \"\"\" Input to this function is a string containing multiple groups of nested parentheses. 
Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups('( ) (( )) (( )( ))')\n ['()', '(())', '(()())']\n \"\"\"\n" - ) - assert decoded_batch[2].endswith( - "Code start: \n\n\ndef truncate_number(number: float) -> float:\n \"\"\" Given a positive floating point number, it can be decomposed into\n and integer part (largest integer smaller than given number) and decimals\n (leftover part always smaller than 1).\n\n Return the decimal part of the number.\n >>> truncate_number(3.5)\n 0.5\n \"\"\"\n" - ) - assert decoded_batch[3].endswith( + # assert decoded_batch[0].endswith( + samples = [ + "Code start: \nfrom typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n", + "Code start: \nfrom typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n \"\"\" Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups('( ) (( )) (( )( ))')\n ['()', '(())', '(()())']\n \"\"\"\n", + "Code start: \n\n\ndef truncate_number(number: float) -> float:\n \"\"\" Given a positive floating point number, it can be decomposed into\n and integer part (largest integer smaller than given number) and decimals\n (leftover part always smaller than 1).\n\n Return the decimal part of the number.\n >>> truncate_number(3.5)\n 0.5\n \"\"\"\n", "Code start: \nfrom typing import List\n\n\ndef below_zero(operations: List[int]) -> bool:\n \"\"\" You're given a list of deposit and withdrawal operations on a bank account that starts with\n zero balance. Your task is to detect if at any point the balance of account fallls below zero, and\n at that point function should return True. 
Otherwise it should return False.\n >>> below_zero([1, 2, 3])\n False\n >>> below_zero([1, 2, -4, 5])\n True\n \"\"\"\n" - ) + ] + for i in range(4): + for j in range(generations_per_sample): + k = i * generations_per_sample + j + b, n = divmod(k, batch_size) + assert batches[b]['labels'][n] == labels[i] + assert decoded_batches[b][n].endswith(samples[i]) @pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl']) @@ -1693,43 +1688,52 @@ def test_code_eval_task_dataloader(dataset_uri, tmp_path, num_fewshot, prompt_st assert isinstance(dl, DataSpec) assert isinstance(dl.dataloader, DataLoader) # pyright - batch = next(dl.dataloader._get_iterator()) - - max_prompt_length = 0 - if isinstance(dl.dataloader.dataset, InContextLearningCodeEvalDataset): - max_prompt_length = dl.dataloader.dataset.max_prompt_length - assert tuple(batch['input_ids'].shape) == (batch_size, max_prompt_length) - assert tuple(batch['attention_mask'].shape) == (batch_size, max_prompt_length) - assert batch['mode'] == 'generate' - # the maximum generation length from the small test data - assert batch['generation_length'] == 122 - assert any(item[0] != tokenizer.eos_token_id for item in batch['input_ids']) # longest should be pushed left - - decoded_batch = tokenizer.batch_decode(batch['input_ids']) - assert all(item.count('Code start: \n') == num_fewshot + 1 for item in decoded_batch) - - if len(prompt_string) > 0: - assert all(item.count('Please code:\n') == 1 for item in decoded_batch) - - assert batch['labels'] == [ + batches = list(dl.dataloader) + dataset_size = len(open(dataset_uri, 'r').read().strip().split('\n')) + dataset_size *= generations_per_sample + + has_left_padding = [] + for i, batch in enumerate(batches): + max_prompt_length = 0 + if isinstance(dl.dataloader.dataset, InContextLearningCodeEvalDataset): + max_prompt_length = dl.dataloader.dataset.max_prompt_length + N = len(batches) + bs = batch_size if i < N - 1 else dataset_size - (N - 1) * batch_size + assert tuple(batch['input_ids'].shape) == (bs, max_prompt_length) + assert tuple(batch['attention_mask'].shape) == (bs, max_prompt_length) + assert batch['mode'] == 'generate' + # the maximum generation length from the small test data + assert batch['generation_length'] == 122 + has_left_padding.extend([item[0] == tokenizer.eos_token_id for item in batch['input_ids']]) + assert not all(has_left_padding) # longest should be pushed left + + decoded_batches = [tokenizer.batch_decode(batch['input_ids']) for batch in batches] + for decoded_batch in decoded_batches: + assert all(item.count('Code start: \n') == num_fewshot + 1 for item in decoded_batch) + + if len(prompt_string) > 0: + assert all(item.count('Please code:\n') == 1 for item in decoded_batch) + + labels = [ ' for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n', " result = []\n current_string = []\n current_depth = 0\n\n for c in paren_string:\n if c == '(':\n current_depth += 1\n current_string.append(c)\n elif c == ')':\n current_depth -= 1\n current_string.append(c)\n\n if current_depth == 0:\n result.append(''.join(current_string))\n current_string.clear()\n\n return result\n", ' return number % 1.0\n', ' balance = 0\n\n for op in operations:\n balance += op\n if balance < 0:\n return True\n\n return False\n', ] - assert decoded_batch[0].endswith( - "Code start: \nfrom typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: 
float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n" - ) - assert decoded_batch[1].endswith( - "Code start: \nfrom typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n \"\"\" Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups('( ) (( )) (( )( ))')\n ['()', '(())', '(()())']\n \"\"\"\n" - ) - assert decoded_batch[2].endswith( - "Code start: \n\n\ndef truncate_number(number: float) -> float:\n \"\"\" Given a positive floating point number, it can be decomposed into\n and integer part (largest integer smaller than given number) and decimals\n (leftover part always smaller than 1).\n\n Return the decimal part of the number.\n >>> truncate_number(3.5)\n 0.5\n \"\"\"\n" - ) - assert decoded_batch[3].endswith( + # assert decoded_batch[0].endswith( + samples = [ + "Code start: \nfrom typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n", + "Code start: \nfrom typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n \"\"\" Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups('( ) (( )) (( )( ))')\n ['()', '(())', '(()())']\n \"\"\"\n", + "Code start: \n\n\ndef truncate_number(number: float) -> float:\n \"\"\" Given a positive floating point number, it can be decomposed into\n and integer part (largest integer smaller than given number) and decimals\n (leftover part always smaller than 1).\n\n Return the decimal part of the number.\n >>> truncate_number(3.5)\n 0.5\n \"\"\"\n", "Code start: \nfrom typing import List\n\n\ndef below_zero(operations: List[int]) -> bool:\n \"\"\" You're given a list of deposit and withdrawal operations on a bank account that starts with\n zero balance. Your task is to detect if at any point the balance of account fallls below zero, and\n at that point function should return True. 
Otherwise it should return False.\n >>> below_zero([1, 2, 3])\n False\n >>> below_zero([1, 2, -4, 5])\n True\n \"\"\"\n" - ) + ] + for i in range(4): + for j in range(generations_per_sample): + k = i * generations_per_sample + j + b, n = divmod(k, batch_size) + assert batches[b]['labels'][n] == labels[i] + assert decoded_batches[b][n].endswith(samples[i]) @pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl']) @@ -1779,7 +1783,6 @@ def test_eval_split_batch(tiny_opt_tokenizer, dataset_uri, num_fewshot, tmp_path assert microbatch['generation_kwargs']['top_k'] == 40 assert microbatch['generation_kwargs']['pad_token_id'] == 0 assert microbatch['generation_kwargs']['num_beams'] == 1 - assert microbatch['generation_kwargs']['num_return_sequences'] == 1 assert microbatch['generation_kwargs']['do_sample'] == True assert microbatch['generation_kwargs']['use_cache'] == True assert microbatch['generation_kwargs']['eos_token_id'] == 0 @@ -2220,7 +2223,7 @@ def test_code_eval_microbatching(monkeypatch, device, world_size, tiny_opt_token trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger) torch.use_deterministic_algorithms(False) - trainer.eval(eval_dataloader=evaluator, subset_num_batches=2) + trainer.eval(eval_dataloader=evaluator) torch.use_deterministic_algorithms(True) assert 'metrics/humaneval/InContextLearningCodeEvalAccuracy' in in_memory_logger.data.keys() assert in_memory_logger.data['metrics/humaneval/InContextLearningCodeEvalAccuracy'][0][1].item() == 0 @@ -2268,7 +2271,7 @@ def test_code_eval_sentpiece_evaluation(monkeypatch, device, world_size, num_few trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger) torch.use_deterministic_algorithms(False) - trainer.eval(eval_dataloader=evaluator, subset_num_batches=2) + trainer.eval(eval_dataloader=evaluator) torch.use_deterministic_algorithms(True) assert 'metrics/humaneval/InContextLearningCodeEvalAccuracy' in in_memory_logger.data.keys() assert in_memory_logger.data['metrics/humaneval/InContextLearningCodeEvalAccuracy'][0][1].item() == 0 @@ -2317,7 +2320,7 @@ def test_code_eval_task_evaluation(monkeypatch, device, world_size, num_fewshot, trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger) torch.use_deterministic_algorithms(False) - trainer.eval(eval_dataloader=evaluator, subset_num_batches=2) + trainer.eval(eval_dataloader=evaluator) torch.use_deterministic_algorithms(True) assert 'metrics/humaneval/InContextLearningCodeEvalAccuracy' in in_memory_logger.data.keys() assert in_memory_logger.data['metrics/humaneval/InContextLearningCodeEvalAccuracy'][0][1].item() == 0 diff --git a/tests/metrics/test_nlp_metrics.py b/tests/metrics/test_nlp_metrics.py index e31cd4d410..64df57cb34 100644 --- a/tests/metrics/test_nlp_metrics.py +++ b/tests/metrics/test_nlp_metrics.py @@ -312,20 +312,33 @@ def test_in_context_learning_code_eval_accuracy(monkeypatch): entry_points = ['fib', 'multiply_by_two', 'add_one'] test_inputs = [['(1,)', '(2,)', '(4,)'], ['(1,)', '(2,)', '(4,)'], ['(1,)', '(2,)', '(4,)']] test_outputs = [['1', '2', '5'], ['2', '4', '8'], ['2', '3', '5']] + sample_ids = [0, 1, 2] languages = ['python', 'python', 'python'] monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL') + generations_per_sample = 2 + + def repeat(values): + return [val for val in values for _ in range(generations_per_sample)] + + transformers = pytest.importorskip('transformers') + tokenizer = transformers.AutoTokenizer.from_pretrained('mosaicml/mpt-7b') # type: ignore reportUnboundVariable + 
tokenizer.pad_token = tokenizer.eos_token + input_ids = tokenizer.batch_encode_plus(repeat(prompts), return_tensors='pt', padding=True)['input_ids'] batch = { # This tests deterministic beam search rather than sampling + 'input_ids': input_ids, 'generation_kwargs': { 'num_beams': 1, - 'num_return_sequences': 2 }, - 'prompts': prompts, - 'pass_at_k': 1, - 'entry_points': entry_points, - 'test_inputs': test_inputs, - 'test_outputs': test_outputs, - 'languages': languages, + 'prompts': repeat(prompts), + 'pass_at_k': [1], + 'entry_points': repeat(entry_points), + 'test_inputs': repeat(test_inputs), + 'test_outputs': repeat(test_outputs), + 'languages': repeat(languages), + 'dataset_size': len(prompts), + 'generations_per_sample': generations_per_sample, + 'sample_id': repeat(sample_ids), } metric = InContextLearningCodeEvalAccuracy() metric.update(batch, outputs, labels) From 6e28c883ada2499baa09fb28d001abd5c9d84646 Mon Sep 17 00:00:00 2001 From: Jane Zhang Date: Thu, 15 Feb 2024 17:11:49 -0800 Subject: [PATCH 03/28] Use Mosaic constant for GPU file prefix (#3018) * added new constant * using local instead of global rank --- composer/cli/launcher.py | 5 +++-- composer/loggers/mosaicml_logger.py | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/composer/cli/launcher.py b/composer/cli/launcher.py index c72e1b01db..c25772621a 100755 --- a/composer/cli/launcher.py +++ b/composer/cli/launcher.py @@ -22,7 +22,8 @@ import torch import composer -from composer.loggers.mosaicml_logger import MOSAICML_LOG_DIR_ENV_VAR, MOSAICML_PLATFORM_ENV_VAR +from composer.loggers.mosaicml_logger import (MOSAICML_GPU_LOG_FILE_PREFIX_ENV_VAR, MOSAICML_LOG_DIR_ENV_VAR, + MOSAICML_PLATFORM_ENV_VAR) from composer.utils import get_free_tcp_port CLEANUP_TIMEOUT = datetime.timedelta(seconds=30) @@ -485,7 +486,7 @@ def main(): MOSAICML_PLATFORM_ENV_VAR, 'false').lower() == 'true' and str(os.environ.get(MOSAICML_LOG_DIR_ENV_VAR, 'false')).lower() != 'false': log.info('Logging all GPU ranks to Mosaic Platform.') - log_file_format = f'{os.environ.get(MOSAICML_LOG_DIR_ENV_VAR)}/gpu_{{rank}}.txt' + log_file_format = f'{os.environ.get(MOSAICML_LOG_DIR_ENV_VAR)}/{os.environ.get(MOSAICML_GPU_LOG_FILE_PREFIX_ENV_VAR)}{{local_rank}}.txt' if args.stderr is not None or args.stdout is not None: warnings.warn( 'Logging to Mosaic Platform. Ignoring provided stdout and stderr args. To use provided stdout and stderr, set MOSAICML_LOG_DIR=false.' 
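# A minimal sketch of the log path template assembled above -- the env values are
# assumptions for illustration, not taken from the diff: with MOSAICML_LOG_DIR=/mnt/logs
# and MOSAICML_GPU_LOG_FILE_PREFIX=gpu_rank_, the f-string yields a per-rank format
# string that the launcher later fills with each process's local rank.
log_file_format = '/mnt/logs/' + 'gpu_rank_' + '{local_rank}.txt'
print(log_file_format.format(local_rank=0))  # -> /mnt/logs/gpu_rank_0.txt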
diff --git a/composer/loggers/mosaicml_logger.py b/composer/loggers/mosaicml_logger.py index cab710a4c3..fa317d6f8c 100644 --- a/composer/loggers/mosaicml_logger.py +++ b/composer/loggers/mosaicml_logger.py @@ -37,6 +37,7 @@ MOSAICML_PLATFORM_ENV_VAR = 'MOSAICML_PLATFORM' MOSAICML_ACCESS_TOKEN_ENV_VAR = 'MOSAICML_ACCESS_TOKEN_FILE' MOSAICML_LOG_DIR_ENV_VAR = 'MOSAICML_LOG_DIR' +MOSAICML_GPU_LOG_FILE_PREFIX_ENV_VAR = 'MOSAICML_GPU_LOG_FILE_PREFIX' class MosaicMLLogger(LoggerDestination): From 6096f65dd4e33f62f58f73ce40b13be83d0e5371 Mon Sep 17 00:00:00 2001 From: Jane Zhang Date: Fri, 16 Feb 2024 08:59:42 -0800 Subject: [PATCH 04/28] Fall back to normal logging when gpu prefix is not present (#3020) * added new condition * removed a comma * Update composer/cli/launcher.py Co-authored-by: Mihir Patel --------- Co-authored-by: Mihir Patel --- composer/cli/launcher.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/composer/cli/launcher.py b/composer/cli/launcher.py index c25772621a..3dd33aede7 100755 --- a/composer/cli/launcher.py +++ b/composer/cli/launcher.py @@ -482,9 +482,9 @@ def main(): args.stderr = f'{log_tmpdir.name}/rank{{rank}}.stderr.txt' # If running on the Mosaic platform, log all gpu ranks' stderr and stdout to Mosaic platform - if os.environ.get( - MOSAICML_PLATFORM_ENV_VAR, - 'false').lower() == 'true' and str(os.environ.get(MOSAICML_LOG_DIR_ENV_VAR, 'false')).lower() != 'false': + if os.environ.get(MOSAICML_PLATFORM_ENV_VAR, 'false').lower() == 'true' and str( + os.environ.get(MOSAICML_LOG_DIR_ENV_VAR, 'false')).lower() != 'false' and os.environ.get( + MOSAICML_GPU_LOG_FILE_PREFIX_ENV_VAR, 'false').lower() != 'false': log.info('Logging all GPU ranks to Mosaic Platform.') log_file_format = f'{os.environ.get(MOSAICML_LOG_DIR_ENV_VAR)}/{os.environ.get(MOSAICML_GPU_LOG_FILE_PREFIX_ENV_VAR)}{{local_rank}}.txt' if args.stderr is not None or args.stdout is not None: From 0480169ccca7644509d95d3619a137f987af63a5 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Fri, 16 Feb 2024 13:24:50 -0500 Subject: [PATCH 05/28] Revert "Use testing repo actions (#2948)" (#3023) This reverts commit c47e012fd3ec79eed50b5842a56acd67ac8ef220. --- .github/workflows/code-quality.yaml | 57 +++++++++----------------- .github/workflows/codeql-analysis.yml | 47 ++++++++++++++------- .github/workflows/coverage.yaml | 37 ++++++++--------- .github/workflows/daily.yaml | 3 -- .github/workflows/pr-code-quality.yaml | 28 +++++++++++++ .github/workflows/pr-cpu.yaml | 3 -- .github/workflows/smoketest.yaml | 29 ++++++------- 7 files changed, 108 insertions(+), 96 deletions(-) create mode 100644 .github/workflows/pr-code-quality.yaml diff --git a/.github/workflows/code-quality.yaml b/.github/workflows/code-quality.yaml index e3400b81b4..20bbf327b7 100644 --- a/.github/workflows/code-quality.yaml +++ b/.github/workflows/code-quality.yaml @@ -1,17 +1,13 @@ name: Code Quality Checks on: - push: - branches: - - dev - - main - - release/** - pull_request: - workflow_dispatch: -# Cancel old runs when a new commit is pushed to the same branch if not on main -# or dev -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }} + workflow_call: + inputs: + python_version: + required: true + type: string + pip_deps: + required: true + type: string defaults: run: working-directory: . 
@@ -19,31 +15,16 @@ jobs: code-quality: runs-on: ubuntu-20.04 timeout-minutes: 15 - strategy: - matrix: - python_version: - - "3.9" - - "3.10" - - "3.11" - pip_deps: - - "[dev]" steps: - - name: Checkout - uses: actions/checkout@v3 - - name: Get Repo Token - id: REPO_TOKEN - uses: tibdex/github-app-token@v1 + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 with: - app_id: ${{ secrets.DEVOGE_BOT_PUB_APPID }} - private_key: ${{ secrets.DEVOGE_BOT_PUB_PEM }} - - name: Get composite run steps repository - uses: actions/checkout@v3 - with: - repository: mosaicml/testing - ref: v0.0.2 - path: ./testing - token: ${{ steps.REPO_TOKEN.outputs.token }} - - uses: ./testing/.github/actions/code-quality - with: - python_version: ${{ matrix.python_version }} - pip_deps: ${{ matrix.pip_deps }} + python-version: ${{ inputs.python_version }} + - name: Setup + run: | + set -ex + python -m pip install --upgrade 'pip<23' wheel + python -m pip install --upgrade .${{ inputs.pip_deps }} + - name: Run checks + run: | + pre-commit run --all-files diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index 5a4ddb477c..151179d524 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -41,19 +41,36 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v2 - - name: Get Repo Token - id: REPO_TOKEN - uses: tibdex/github-app-token@v1 - with: - app_id: ${{ secrets.DEVOGE_BOT_PUB_APPID }} - private_key: ${{ secrets.DEVOGE_BOT_PUB_PEM }} - - name: Get composite run steps repository - uses: actions/checkout@v3 - with: - repository: mosaicml/testing - ref: v0.0.2 - path: ./testing - token: ${{ steps.REPO_TOKEN.outputs.token }} - - uses: ./testing/.github/actions/codeql-analysis + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v2 with: - language: ${{ matrix.language }} + languages: ${{ matrix.language }} + # If you wish to specify custom queries, you can do so here or in a + # config file. + # By default, queries listed here will override any specified in a + # config file. + # Prefix the list here with "+" to use these queries and those in the + # config file. + # queries: ./path/to/local/query, your-org/your-repo/queries@main + + # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). + # If this step fails, then you should remove it and run the build manually + # (see below) + - name: Autobuild + uses: github/codeql-action/autobuild@v2 + + # ℹī¸ Command-line programs to run using the OS shell. 
+ # 📚 https://git.io/JvXDl + + # ✏ī¸ If the Autobuild fails above, remove it and uncomment the following + # three lines and modify them (or add more) to build your code if your + # project uses a compiled language + + # - run: | + # make bootstrap + # make release + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v2 diff --git a/.github/workflows/coverage.yaml b/.github/workflows/coverage.yaml index fe69f936ab..f89d67ec39 100644 --- a/.github/workflows/coverage.yaml +++ b/.github/workflows/coverage.yaml @@ -5,11 +5,6 @@ on: download-path: required: true type: string - secrets: - app_id: - required: true - private_key: - required: true jobs: coverage: timeout-minutes: 5 @@ -17,19 +12,21 @@ jobs: steps: - name: Checkout Repo uses: actions/checkout@v3 - - name: Get Repo Token - id: REPO_TOKEN - uses: tibdex/github-app-token@v1 - with: - app_id: ${{ secrets.app_id }} - private_key: ${{ secrets.private_key }} - - name: Get composite run steps repository - uses: actions/checkout@v3 - with: - repository: mosaicml/testing - ref: v0.0.2 - path: ./testing - token: ${{ steps.REPO_TOKEN.outputs.token }} - - uses: ./testing/.github/actions/coverage + - name: Setup + run: | + set -ex + python -m pip install --upgrade 'pip<23' wheel + pip install coverage[toml]==6.5.0 + - name: Download artifacts + uses: actions/download-artifact@v3 with: - download-path: ${{ inputs.download-path }} + path: ${{ inputs.download-path }} + - name: Generate coverage report + run: | + set -ex + + # Flatten the coverage files + ls ${{ inputs.download-path }} | while read x; do mv ${{ inputs.download-path }}/$x/.coverage .coverage.$x; done + + python -m coverage combine + python -m coverage report diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml index 3867a75b71..3c65b0f4fa 100644 --- a/.github/workflows/daily.yaml +++ b/.github/workflows/daily.yaml @@ -97,9 +97,6 @@ jobs: needs: [daily-pytest-cpu] with: download-path: artifacts - secrets: - app_id: ${{ secrets.DEVOGE_BOT_PUB_APPID }} - private_key: ${{ secrets.DEVOGE_BOT_PUB_PEM }} daily-pytest-gpu: uses: ./.github/workflows/pytest-gpu.yaml diff --git a/.github/workflows/pr-code-quality.yaml b/.github/workflows/pr-code-quality.yaml new file mode 100644 index 0000000000..26d2546e75 --- /dev/null +++ b/.github/workflows/pr-code-quality.yaml @@ -0,0 +1,28 @@ +name: PR Code Quality Checks +on: + push: + branches: + - dev + - main + - release/** + pull_request: + workflow_dispatch: +# Cancel old runs when a new commit is pushed to the same branch if not on main +# or dev +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }} +jobs: + code-quality: + uses: ./.github/workflows/code-quality.yaml + strategy: + matrix: + python_version: + - "3.9" + - "3.10" + - "3.11" + pip_deps: + - "[dev]" + with: + python_version: ${{ matrix.python_version }} + pip_deps: ${{ matrix.pip_deps }} diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index c17dd32f0a..6eee54cb0b 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -43,6 +43,3 @@ jobs: needs: [pytest-cpu] with: download-path: artifacts - secrets: - app_id: ${{ secrets.DEVOGE_BOT_PUB_APPID }} - private_key: ${{ secrets.DEVOGE_BOT_PUB_PEM }} diff --git a/.github/workflows/smoketest.yaml b/.github/workflows/smoketest.yaml index 7d30a7b0e5..429cc40b1d 100644 --- a/.github/workflows/smoketest.yaml +++ 
b/.github/workflows/smoketest.yaml @@ -27,21 +27,16 @@ jobs: - "3.10" - "3.11" steps: - - name: Checkout - uses: actions/checkout@v3 - - name: Get Repo Token - id: REPO_TOKEN - uses: tibdex/github-app-token@v1 + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 with: - app_id: ${{ secrets.DEVOGE_BOT_PUB_APPID }} - private_key: ${{ secrets.DEVOGE_BOT_PUB_PEM }} - - name: Get composite run steps repository - uses: actions/checkout@v3 - with: - repository: mosaicml/testing - ref: v0.0.2 - path: ./testing - token: ${{ steps.REPO_TOKEN.outputs.token }} - - uses: ./testing/.github/actions/smoketest - with: - python_version: ${{ matrix.python_version }} + python-version: ${{ matrix.python_version }} + - name: Setup + run: | + set -ex + python -m pip install --upgrade 'pip<23' wheel + python -m pip install --upgrade . + python -m pip install pytest==7.2.1 pytest_codeblocks==0.16.1 + - name: Run checks + run: | + pytest tests/test_smoketest.py From 7d6afe17a6fdcb97863ff7776502babfe2056d03 Mon Sep 17 00:00:00 2001 From: Brian <23239305+b-chu@users.noreply.github.com> Date: Fri, 16 Feb 2024 11:08:52 -0800 Subject: [PATCH 06/28] Change to pull_request_target (#3025) * Use testing repo actions (#2948) * Change to pull_request_target --- .github/workflows/code-quality.yaml | 57 +++++++++++++++++--------- .github/workflows/codeql-analysis.yml | 49 ++++++++-------------- .github/workflows/coverage.yaml | 37 +++++++++-------- .github/workflows/daily.yaml | 3 ++ .github/workflows/pr-code-quality.yaml | 28 ------------- .github/workflows/pr-cpu.yaml | 5 ++- .github/workflows/smoketest.yaml | 31 ++++++++------ 7 files changed, 99 insertions(+), 111 deletions(-) delete mode 100644 .github/workflows/pr-code-quality.yaml diff --git a/.github/workflows/code-quality.yaml b/.github/workflows/code-quality.yaml index 20bbf327b7..9e00def203 100644 --- a/.github/workflows/code-quality.yaml +++ b/.github/workflows/code-quality.yaml @@ -1,13 +1,17 @@ name: Code Quality Checks on: - workflow_call: - inputs: - python_version: - required: true - type: string - pip_deps: - required: true - type: string + push: + branches: + - dev + - main + - release/** + pull_request_target: + workflow_dispatch: +# Cancel old runs when a new commit is pushed to the same branch if not on main +# or dev +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }} defaults: run: working-directory: . 
@@ -15,16 +19,31 @@ jobs: code-quality: runs-on: ubuntu-20.04 timeout-minutes: 15 + strategy: + matrix: + python_version: + - "3.9" + - "3.10" + - "3.11" + pip_deps: + - "[dev]" steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 + - name: Checkout + uses: actions/checkout@v3 + - name: Get Repo Token + id: REPO_TOKEN + uses: tibdex/github-app-token@v1 with: - python-version: ${{ inputs.python_version }} - - name: Setup - run: | - set -ex - python -m pip install --upgrade 'pip<23' wheel - python -m pip install --upgrade .${{ inputs.pip_deps }} - - name: Run checks - run: | - pre-commit run --all-files + app_id: ${{ secrets.DEVOGE_BOT_PUB_APPID }} + private_key: ${{ secrets.DEVOGE_BOT_PUB_PEM }} + - name: Get composite run steps repository + uses: actions/checkout@v3 + with: + repository: mosaicml/testing + ref: v0.0.2 + path: ./testing + token: ${{ steps.REPO_TOKEN.outputs.token }} + - uses: ./testing/.github/actions/code-quality + with: + python_version: ${{ matrix.python_version }} + pip_deps: ${{ matrix.pip_deps }} diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index 151179d524..511c70aa85 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -14,7 +14,7 @@ name: "CodeQL" on: push: branches: [dev, main] - pull_request: + pull_request_target: # The branches below must be a subset of the branches above branches: [dev, main] schedule: @@ -41,36 +41,19 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v2 - - # Initializes the CodeQL tools for scanning. - - name: Initialize CodeQL - uses: github/codeql-action/init@v2 + - name: Get Repo Token + id: REPO_TOKEN + uses: tibdex/github-app-token@v1 with: - languages: ${{ matrix.language }} - # If you wish to specify custom queries, you can do so here or in a - # config file. - # By default, queries listed here will override any specified in a - # config file. - # Prefix the list here with "+" to use these queries and those in the - # config file. - # queries: ./path/to/local/query, your-org/your-repo/queries@main - - # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). - # If this step fails, then you should remove it and run the build manually - # (see below) - - name: Autobuild - uses: github/codeql-action/autobuild@v2 - - # ℹī¸ Command-line programs to run using the OS shell. 
- # 📚 https://git.io/JvXDl - - # ✏ī¸ If the Autobuild fails above, remove it and uncomment the following - # three lines and modify them (or add more) to build your code if your - # project uses a compiled language - - # - run: | - # make bootstrap - # make release - - - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v2 + app_id: ${{ secrets.DEVOGE_BOT_PUB_APPID }} + private_key: ${{ secrets.DEVOGE_BOT_PUB_PEM }} + - name: Get composite run steps repository + uses: actions/checkout@v3 + with: + repository: mosaicml/testing + ref: v0.0.2 + path: ./testing + token: ${{ steps.REPO_TOKEN.outputs.token }} + - uses: ./testing/.github/actions/codeql-analysis + with: + language: ${{ matrix.language }} diff --git a/.github/workflows/coverage.yaml b/.github/workflows/coverage.yaml index f89d67ec39..fe69f936ab 100644 --- a/.github/workflows/coverage.yaml +++ b/.github/workflows/coverage.yaml @@ -5,6 +5,11 @@ on: download-path: required: true type: string + secrets: + app_id: + required: true + private_key: + required: true jobs: coverage: timeout-minutes: 5 @@ -12,21 +17,19 @@ jobs: steps: - name: Checkout Repo uses: actions/checkout@v3 - - name: Setup - run: | - set -ex - python -m pip install --upgrade 'pip<23' wheel - pip install coverage[toml]==6.5.0 - - name: Download artifacts - uses: actions/download-artifact@v3 + - name: Get Repo Token + id: REPO_TOKEN + uses: tibdex/github-app-token@v1 + with: + app_id: ${{ secrets.app_id }} + private_key: ${{ secrets.private_key }} + - name: Get composite run steps repository + uses: actions/checkout@v3 + with: + repository: mosaicml/testing + ref: v0.0.2 + path: ./testing + token: ${{ steps.REPO_TOKEN.outputs.token }} + - uses: ./testing/.github/actions/coverage with: - path: ${{ inputs.download-path }} - - name: Generate coverage report - run: | - set -ex - - # Flatten the coverage files - ls ${{ inputs.download-path }} | while read x; do mv ${{ inputs.download-path }}/$x/.coverage .coverage.$x; done - - python -m coverage combine - python -m coverage report + download-path: ${{ inputs.download-path }} diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml index 3c65b0f4fa..3867a75b71 100644 --- a/.github/workflows/daily.yaml +++ b/.github/workflows/daily.yaml @@ -97,6 +97,9 @@ jobs: needs: [daily-pytest-cpu] with: download-path: artifacts + secrets: + app_id: ${{ secrets.DEVOGE_BOT_PUB_APPID }} + private_key: ${{ secrets.DEVOGE_BOT_PUB_PEM }} daily-pytest-gpu: uses: ./.github/workflows/pytest-gpu.yaml diff --git a/.github/workflows/pr-code-quality.yaml b/.github/workflows/pr-code-quality.yaml deleted file mode 100644 index 26d2546e75..0000000000 --- a/.github/workflows/pr-code-quality.yaml +++ /dev/null @@ -1,28 +0,0 @@ -name: PR Code Quality Checks -on: - push: - branches: - - dev - - main - - release/** - pull_request: - workflow_dispatch: -# Cancel old runs when a new commit is pushed to the same branch if not on main -# or dev -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }} -jobs: - code-quality: - uses: ./.github/workflows/code-quality.yaml - strategy: - matrix: - python_version: - - "3.9" - - "3.10" - - "3.11" - pip_deps: - - "[dev]" - with: - python_version: ${{ matrix.python_version }} - pip_deps: ${{ matrix.pip_deps }} diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index 6eee54cb0b..d3e752c70e 100644 --- a/.github/workflows/pr-cpu.yaml 
+++ b/.github/workflows/pr-cpu.yaml @@ -1,6 +1,6 @@ name: PR CPU tests on: - pull_request: + pull_request_target: workflow_dispatch: # Cancel old runs when a new commit is pushed to the same branch if not on main # or dev @@ -43,3 +43,6 @@ jobs: needs: [pytest-cpu] with: download-path: artifacts + secrets: + app_id: ${{ secrets.DEVOGE_BOT_PUB_APPID }} + private_key: ${{ secrets.DEVOGE_BOT_PUB_PEM }} diff --git a/.github/workflows/smoketest.yaml b/.github/workflows/smoketest.yaml index 429cc40b1d..59286a41aa 100644 --- a/.github/workflows/smoketest.yaml +++ b/.github/workflows/smoketest.yaml @@ -5,7 +5,7 @@ on: - dev - main - release/** - pull_request: + pull_request_target: workflow_call: workflow_dispatch: # Cancel old runs when a new commit is pushed to the same branch if not on main @@ -27,16 +27,21 @@ jobs: - "3.10" - "3.11" steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 + - name: Checkout + uses: actions/checkout@v3 + - name: Get Repo Token + id: REPO_TOKEN + uses: tibdex/github-app-token@v1 with: - python-version: ${{ matrix.python_version }} - - name: Setup - run: | - set -ex - python -m pip install --upgrade 'pip<23' wheel - python -m pip install --upgrade . - python -m pip install pytest==7.2.1 pytest_codeblocks==0.16.1 - - name: Run checks - run: | - pytest tests/test_smoketest.py + app_id: ${{ secrets.DEVOGE_BOT_PUB_APPID }} + private_key: ${{ secrets.DEVOGE_BOT_PUB_PEM }} + - name: Get composite run steps repository + uses: actions/checkout@v3 + with: + repository: mosaicml/testing + ref: v0.0.2 + path: ./testing + token: ${{ steps.REPO_TOKEN.outputs.token }} + - uses: ./testing/.github/actions/smoketest + with: + python_version: ${{ matrix.python_version }} From 368543bb212dd5d33362de7550276de5e448f6f5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 19 Feb 2024 17:53:36 -0500 Subject: [PATCH 07/28] Bump gitpython from 3.1.41 to 3.1.42 (#3031) Bumps [gitpython](https://github.com/gitpython-developers/GitPython) from 3.1.41 to 3.1.42. - [Release notes](https://github.com/gitpython-developers/GitPython/releases) - [Changelog](https://github.com/gitpython-developers/GitPython/blob/main/CHANGES) - [Commits](https://github.com/gitpython-developers/GitPython/compare/3.1.41...3.1.42) --- updated-dependencies: - dependency-name: gitpython dependency-type: direct:development update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6600f716a7..71d54ef3db 100644 --- a/setup.py +++ b/setup.py @@ -134,7 +134,7 @@ def package_files(prefix: str, directory: str, extension: str): 'nbsphinx==0.9.1', 'pandoc==2.3', 'pypandoc==1.12', - 'GitPython==3.1.41', + 'GitPython==3.1.42', 'moto[s3]>=4.0.1,<5', 'mock-ssh-server==0.9.1', 'cryptography==41.0.5', From 0d14a954ed636cc3ec8dc1f4101b0a1f35d05249 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 19 Feb 2024 17:54:29 -0500 Subject: [PATCH 08/28] Bump yamllint from 1.34.0 to 1.35.1 (#3034) Bumps [yamllint](https://github.com/adrienverge/yamllint) from 1.34.0 to 1.35.1. 
- [Changelog](https://github.com/adrienverge/yamllint/blob/master/CHANGELOG.rst) - [Commits](https://github.com/adrienverge/yamllint/compare/v1.34.0...v1.35.1) --- updated-dependencies: - dependency-name: yamllint dependency-type: direct:development update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 71d54ef3db..5a2e1d829d 100644 --- a/setup.py +++ b/setup.py @@ -106,7 +106,7 @@ def package_files(prefix: str, directory: str, extension: str): 'ipython==8.11.0', 'ipykernel==6.29.2', 'jupyter==1.0.0', - 'yamllint==1.34.0', + 'yamllint==1.35.1', 'recommonmark==0.7.1', 'sphinx==4.4.0', 'pre-commit>=3.4.0,<4', From 6f547d0b8a196653c2ac7d9e512537624b8e9827 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 19 Feb 2024 17:54:54 -0500 Subject: [PATCH 09/28] Update torchmetrics requirement from <1.3.1,>=0.10.0 to >=0.10.0,<1.3.2 (#3035) Updates the requirements on [torchmetrics](https://github.com/Lightning-AI/torchmetrics) to permit the latest version. - [Release notes](https://github.com/Lightning-AI/torchmetrics/releases) - [Changelog](https://github.com/Lightning-AI/torchmetrics/blob/master/CHANGELOG.md) - [Commits](https://github.com/Lightning-AI/torchmetrics/compare/v0.10.0...v1.3.1) --- updated-dependencies: - dependency-name: torchmetrics dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 5a2e1d829d..7122a38eeb 100644 --- a/setup.py +++ b/setup.py @@ -76,7 +76,7 @@ def package_files(prefix: str, directory: str, extension: str): install_requires = [ 'pyyaml>=6.0,<7', 'tqdm>=4.62.3,<5', - 'torchmetrics>=0.10.0,<1.3.1', + 'torchmetrics>=0.10.0,<1.3.2', 'torch_optimizer>=0.3.0,<0.4', 'torchvision>=0.13.1,<0.20', # TODO: Tighten before release 'torch>=2.0.1,<2.3.1', # TODO: Tighten before release From 2f684ea3ad43b85c7f8eccf361da4ce6be827552 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 19 Feb 2024 22:25:45 -0500 Subject: [PATCH 10/28] Bump pypandoc from 1.12 to 1.13 (#3033) Bumps [pypandoc](https://github.com/JessicaTegner/pypandoc) from 1.12 to 1.13. - [Release notes](https://github.com/JessicaTegner/pypandoc/releases) - [Changelog](https://github.com/JessicaTegner/pypandoc/blob/master/release.md) - [Commits](https://github.com/JessicaTegner/pypandoc/compare/v1.12...v1.13) --- updated-dependencies: - dependency-name: pypandoc dependency-type: direct:development update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Mihir Patel --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 7122a38eeb..f77ff65036 100644 --- a/setup.py +++ b/setup.py @@ -133,7 +133,7 @@ def package_files(prefix: str, directory: str, extension: str): 'traitlets==5.14.1', 'nbsphinx==0.9.1', 'pandoc==2.3', - 'pypandoc==1.12', + 'pypandoc==1.13', 'GitPython==3.1.42', 'moto[s3]>=4.0.1,<5', 'mock-ssh-server==0.9.1', From e12cbe3bc98b5bb7319afe4a1825fedd00f79185 Mon Sep 17 00:00:00 2001 From: Tanguy Date: Tue, 20 Feb 2024 16:17:08 +0100 Subject: [PATCH 11/28] Add tensorboard images support (#3021) * Add tensorboard images support * address review --- composer/loggers/tensorboard_logger.py | 39 ++++++++++++++++++- tests/loggers/test_tensorboard_logger.py | 49 ++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 1 deletion(-) create mode 100644 tests/loggers/test_tensorboard_logger.py diff --git a/composer/loggers/tensorboard_logger.py b/composer/loggers/tensorboard_logger.py index 7f98a62e36..cf7bbfdc93 100644 --- a/composer/loggers/tensorboard_logger.py +++ b/composer/loggers/tensorboard_logger.py @@ -4,7 +4,10 @@ """Log to `Tensorboard `_.""" from pathlib import Path -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional, Sequence, Union + +import numpy as np +import torch from composer.core.state import State from composer.loggers.logger import Logger, format_log_data_value @@ -136,6 +139,30 @@ def eval_end(self, state: State, logger: Logger) -> None: def fit_end(self, state: State, logger: Logger) -> None: self._flush(logger) + def log_images( + self, + images: Union[np.ndarray, torch.Tensor, Sequence[Union[np.ndarray, torch.Tensor]]], + name: str = 'Images', + channels_last: bool = False, + step: Optional[int] = None, + masks: Optional[Dict[str, Union[np.ndarray, torch.Tensor, Sequence[Union[np.ndarray, torch.Tensor]]]]] = None, + mask_class_labels: Optional[Dict[int, str]] = None, + use_table: bool = False, + ): + images = _convert_to_tensorboard_image(images) + + assert self.writer is not None + if images.ndim <= 3: + assert images.ndim > 1 + if images.ndim == 2: # Assume 2D image + data_format = 'HW' + else: # Assume 2D image with channels? + data_format = 'HWC' if channels_last else 'CHW' + self.writer.add_image(name, images, global_step=step, dataformats=data_format) + return + + self.writer.add_images(name, images, global_step=step, dataformats='NHWC' if channels_last else 'NCHW') + def _flush(self, logger: Logger): # To avoid empty files uploaded for each rank. 
if self.rank_zero_only and dist.get_global_rank() != 0: @@ -164,3 +191,13 @@ def close(self, state: State, logger: Logger) -> None: del state # unused self._flush(logger) self.writer = None + + +def _convert_to_tensorboard_image( + t: Union[np.ndarray, torch.Tensor, Sequence[Union[np.ndarray, torch.Tensor]]]) -> np.ndarray: + if isinstance(t, torch.Tensor): + return t.to(torch.float16).cpu().numpy() + if isinstance(t, list): + return np.array([_convert_to_tensorboard_image(image) for image in t]) + assert isinstance(t, np.ndarray) + return t diff --git a/tests/loggers/test_tensorboard_logger.py b/tests/loggers/test_tensorboard_logger.py new file mode 100644 index 0000000000..ab61b905fd --- /dev/null +++ b/tests/loggers/test_tensorboard_logger.py @@ -0,0 +1,49 @@ +# Copyright 2024 MosaicML Composer authors +# SPDX-License-Identifier: Apache-2.0 + +from typing import Sequence + +import pytest +import torch + +from composer.loggers import Logger, TensorboardLogger + + +@pytest.fixture +def test_tensorboard_logger(tmp_path, dummy_state): + pytest.importorskip('tensorboard', reason='tensorboard is optional') + dummy_state.run_name = 'tensorboard-test-log-image' + logger = Logger(dummy_state, []) + tensorboard_logger = TensorboardLogger(log_dir=str(tmp_path)) + tensorboard_logger.init(dummy_state, logger) + return tensorboard_logger + + +def test_tensorboard_log_image(test_tensorboard_logger): + pytest.importorskip('tensorboard', reason='tensorboard is optional') + + image_variants = [ + (torch.rand(4, 4), False), # 2D image + (torch.rand(2, 3, 4, 4), False), # multiple images, not channels last + (torch.rand(2, 3, 4, 4, dtype=torch.bfloat16), False), # same as above but with bfloat16 + (torch.rand(3, 4, 4), False), # with channels, not channels last + ([torch.rand(4, 4, 3)], True), # with channels, channels last + (torch.rand(2, 4, 4, 3), True), # multiple images, channels last + ([torch.rand(4, 4, 3), torch.rand(4, 4, 3)], True) # multiple images in list + ] + + for idx, (images, channels_last) in enumerate(image_variants): + if isinstance(images, Sequence): + np_images = [image.to(torch.float32).numpy() for image in images] + + else: + np_images = images.to(torch.float32).numpy() + test_tensorboard_logger.log_images(name='Image ' + str(idx) + ' tensor', + images=images, + channels_last=channels_last) + test_tensorboard_logger.log_images(name='Image ' + str(idx) + ' np', + images=np_images, + channels_last=channels_last) + + test_tensorboard_logger.post_close() + # Tensorboard images are stored inline, so we can't check them automatically. From b328599fcc3b51a102eb78bb09c3e12a387be445 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 20 Feb 2024 16:15:35 -0500 Subject: [PATCH 12/28] add sorted (#3036) --- composer/utils/checkpoint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/utils/checkpoint.py b/composer/utils/checkpoint.py index c1fe956192..e74047cf22 100644 --- a/composer/utils/checkpoint.py +++ b/composer/utils/checkpoint.py @@ -301,7 +301,7 @@ def read_data(self, plan: LoadPlan, planner: LoadPlanner): log.debug(f'Rank {dist.get_global_rank()} finished transferring files to all ranks.') dist.barrier() log.debug( - f'Done waiting for all ranks to finish transferring files. Local checkpoint files: {os.listdir(self.destination_path)}' + f'Done waiting for all ranks to finish transferring files. Local checkpoint files: {sorted(os.listdir(self.destination_path))}' ) # 5. 
Piggyback off of the FileSystemReader to read all the files now that they are downloaded. From cc799e50191c742a1f205292ee4c12c9ec487d74 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 20 Feb 2024 17:26:01 -0500 Subject: [PATCH 13/28] Friendlier device mesh error (#3039) * new test * fix * fix * fix error --- composer/trainer/dist_strategy.py | 30 +++++++++++++++++++----------- tests/trainer/test_fsdp.py | 10 ++++++++++ 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/composer/trainer/dist_strategy.py b/composer/trainer/dist_strategy.py index 8b76f8b1ba..a0e9a38656 100644 --- a/composer/trainer/dist_strategy.py +++ b/composer/trainer/dist_strategy.py @@ -13,7 +13,7 @@ from packaging import version from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (CheckpointImpl, apply_activation_checkpointing, checkpoint_wrapper) -from torch.distributed.fsdp import FullyShardedDataParallel +from torch.distributed.fsdp import FullyShardedDataParallel, ShardingStrategy from torch.distributed.fsdp._common_utils import clean_tensor_name from torch.nn.parallel import DistributedDataParallel from torchmetrics import Metric, MetricCollection @@ -248,16 +248,7 @@ def sync_hook(*args): if found_cuda_oom == 1: raise RuntimeError('CUDA out of memory encountered on a different rank') - kwargs = {} - if version.parse(torch.__version__.split('.dev')[0]) >= version.parse('2.2.0'): - if 'device_mesh' in fsdp_config: - from torch.distributed._tensor import init_device_mesh - kwargs['device_mesh'] = init_device_mesh( - 'cuda', - tuple([int(x) for x in fsdp_config['device_mesh']]), - ) - - # necessary variables for optimizers with multiple param groups in FSDP + # Necessary variables for optimizers with multiple param groups in FSDP num_param_groups = None param_name_to_group_num = None group_num_to_param_group_info = None @@ -308,6 +299,23 @@ def sync_hook(*args): sharding_map_key = fsdp_config['sharding_strategy'].upper() sharding_strategy = SHARDING_MAP[sharding_map_key] + kwargs = {} + if version.parse(torch.__version__.split('.dev')[0]) >= version.parse('2.2.0'): + if 'device_mesh' in fsdp_config: + device_mesh_size = len(fsdp_config['device_mesh']) + if sharding_strategy in [ShardingStrategy.FULL_SHARD, ShardingStrategy.NO_SHARD] and device_mesh_size != 1: + raise ValueError(f'FSDP sharding strategy {sharding_map_key.upper()} requires a device mesh ' + f'of size 1 but got device mesh size of {device_mesh_size}.') + elif sharding_strategy in [ShardingStrategy.HYBRID_SHARD, ShardingStrategy._HYBRID_SHARD_ZERO2 + ] and device_mesh_size != 2: + raise ValueError(f'FSDP sharding strategy {sharding_map_key.upper()} requires a device mesh ' + f'of size 2 but got device mesh size of {device_mesh_size}.') + from torch.distributed._tensor import init_device_mesh + kwargs['device_mesh'] = init_device_mesh( + 'cuda', + tuple([int(x) for x in fsdp_config['device_mesh']]), + ) + cpu_offload = get_cpu_offload(cpu_offload=fsdp_config['cpu_offload']) mixed_precision = fsdp_config['mixed_precision'] diff --git a/tests/trainer/test_fsdp.py b/tests/trainer/test_fsdp.py index c6f5258c49..21d6bbbd52 100644 --- a/tests/trainer/test_fsdp.py +++ b/tests/trainer/test_fsdp.py @@ -216,6 +216,16 @@ def test_fsdp_process_group(world_size: int): trainer.fit() +@pytest.mark.gpu +@world_size(2) +@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('2.2.0'), reason='Device mesh requires Torch 2.2') +def test_wrong_size_device_mesh_error(world_size: int): + with pytest.raises(ValueError, 
match='.*requires a device mesh of size 1.*'): + Trainer(model=SimpleModel(), fsdp_config={ + 'device_mesh': [1, 2], + }) + + class SimpleMLP(ComposerModel): def __init__(self, num_features: int = 128, device: str = 'cuda'): From 7855cc73acc3aba8041cb20bb1e6da63137faddb Mon Sep 17 00:00:00 2001 From: Charles Tang Date: Tue, 20 Feb 2024 17:18:34 -0800 Subject: [PATCH 14/28] Update torch nightly aws to python3.11 (#3038) --- docker/README.md | 3 +-- docker/build_matrix.yaml | 33 +++------------------------------ docker/generate_build_matrix.py | 27 +++++---------------------- 3 files changed, 9 insertions(+), 54 deletions(-) diff --git a/docker/README.md b/docker/README.md index c617567f2f..e66ab3049f 100644 --- a/docker/README.md +++ b/docker/README.md @@ -31,8 +31,7 @@ To install composer, once inside the image, run `pip install mosaicml`. | Linux Distro | Flavor | PyTorch Version | CUDA Version | Python Version | Docker Tags | |----------------|----------|-------------------|---------------------|------------------|------------------------------------------------------------------------------------------| | Ubuntu 20.04 | Base | 2.3.0 | 12.1.0 (Infiniband) | 3.11 | `mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.11-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.3.0 | 12.1.0 (Infiniband) | 3.10 | `mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.3.0 | 12.1.0 (EFA) | 3.10 | `mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04-aws` | +| Ubuntu 20.04 | Base | 2.3.0 | 12.1.0 (EFA) | 3.11 | `mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.11-ubuntu20.04-aws` | | Ubuntu 20.04 | Base | 2.2.0 | 12.1.0 (Infiniband) | 3.11 | `mosaicml/pytorch:2.2.0_cu121-python3.11-ubuntu20.04` | | Ubuntu 20.04 | Base | 2.2.0 | 12.1.0 (EFA) | 3.11 | `mosaicml/pytorch:2.2.0_cu121-python3.11-ubuntu20.04-aws` | | Ubuntu 20.04 | Base | 2.2.0 | cpu | 3.11 | `mosaicml/pytorch:2.2.0_cpu-python3.11-ubuntu20.04` | diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml index 700bd4c010..13803e45db 100644 --- a/docker/build_matrix.yaml +++ b/docker/build_matrix.yaml @@ -193,7 +193,7 @@ - AWS_OFI_NCCL_VERSION: v1.7.4-aws BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 CUDA_VERSION: 12.1.0 - IMAGE_NAME: torch-nightly-2-3-0-20240110-cu121-python3-10-aws + IMAGE_NAME: torch-nightly-2-3-0-20240110-cu121-python3-11-aws MOFED_VERSION: '' NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 @@ -209,39 +209,12 @@ brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 - PYTHON_VERSION: '3.10' - PYTORCH_NIGHTLY_URL: https://download.pytorch.org/whl/nightly/cu121 - PYTORCH_NIGHTLY_VERSION: dev20240110+cu121 - PYTORCH_VERSION: 2.3.0 - TAGS: - - mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04-aws - TARGET: pytorch_stage - TORCHVISION_VERSION: 0.18.0 -- AWS_OFI_NCCL_VERSION: '' - BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 - CUDA_VERSION: 12.1.0 - IMAGE_NAME: torch-nightly-2-3-0-20240110-cu121-python3-10 - MOFED_VERSION: 5.5-1.0.3.2 - NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 
brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 - brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 - brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471 - brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471 - brand=tesla,driver>=510,driver<511 brand=unknown,driver>=510,driver<511 brand=nvidia,driver>=510,driver<511 - brand=nvidiartx,driver>=510,driver<511 brand=geforce,driver>=510,driver<511 brand=geforcertx,driver>=510,driver<511 - brand=quadro,driver>=510,driver<511 brand=quadrortx,driver>=510,driver<511 brand=titan,driver>=510,driver<511 - brand=titanrtx,driver>=510,driver<511 brand=tesla,driver>=515,driver<516 brand=unknown,driver>=515,driver<516 - brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=515,driver<516 brand=geforce,driver>=515,driver<516 - brand=geforcertx,driver>=515,driver<516 brand=quadro,driver>=515,driver<516 brand=quadrortx,driver>=515,driver<516 - brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=515,driver<516 brand=tesla,driver>=525,driver<526 - brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 - brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 - brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 - PYTHON_VERSION: '3.10' + PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: https://download.pytorch.org/whl/nightly/cu121 PYTORCH_NIGHTLY_VERSION: dev20240110+cu121 PYTORCH_VERSION: 2.3.0 TAGS: - - mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04 + - mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.11-ubuntu20.04-aws TARGET: pytorch_stage TORCHVISION_VERSION: 0.18.0 - AWS_OFI_NCCL_VERSION: '' diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py index 333010304b..d59219f811 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -224,39 +224,22 @@ def _main(): pytorch_entries.append(entry) - nightly_entry_310_aws = { + nightly_entry_311_aws = { 'AWS_OFI_NCCL_VERSION': 'v1.7.4-aws', 'BASE_IMAGE': 'nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04', 'CUDA_VERSION': '12.1.0', - 'IMAGE_NAME': 'torch-nightly-2-3-0-20240110-cu121-python3-10-aws', + 'IMAGE_NAME': 'torch-nightly-2-3-0-20240110-cu121-python3-11-aws', 'MOFED_VERSION': '', 'NVIDIA_REQUIRE_CUDA_OVERRIDE': _get_cuda_override('12.1.0'), - 'PYTHON_VERSION': '3.10', - 'PYTORCH_VERSION': '2.3.0', - 'PYTORCH_NIGHTLY_URL': 'https://download.pytorch.org/whl/nightly/cu121', - 'PYTORCH_NIGHTLY_VERSION': 'dev20240110+cu121', - 'TAGS': ['mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04-aws'], - 'TARGET': 'pytorch_stage', - 'TORCHVISION_VERSION': '0.18.0' - } - pytorch_entries.append(nightly_entry_310_aws) - - nightly_entry_310 = { - 'AWS_OFI_NCCL_VERSION': '', - 'BASE_IMAGE': 'nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04', - 'CUDA_VERSION': '12.1.0', - 'IMAGE_NAME': 'torch-nightly-2-3-0-20240110-cu121-python3-10', - 'MOFED_VERSION': '5.5-1.0.3.2', - 'NVIDIA_REQUIRE_CUDA_OVERRIDE': _get_cuda_override('12.1.0'), - 'PYTHON_VERSION': '3.10', + 'PYTHON_VERSION': '3.11', 'PYTORCH_VERSION': '2.3.0', 'PYTORCH_NIGHTLY_URL': 'https://download.pytorch.org/whl/nightly/cu121', 'PYTORCH_NIGHTLY_VERSION': 'dev20240110+cu121', - 'TAGS': 
['mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04'], + 'TAGS': ['mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.11-ubuntu20.04-aws'], 'TARGET': 'pytorch_stage', 'TORCHVISION_VERSION': '0.18.0' } - pytorch_entries.append(nightly_entry_310) + pytorch_entries.append(nightly_entry_311_aws) nightly_entry_311 = { 'AWS_OFI_NCCL_VERSION': '', From c2a2b7b43a24927e2c1802b0958923e58fc17a6e Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Wed, 21 Feb 2024 14:51:46 -0500 Subject: [PATCH 15/28] Download symlink once (#3043) * download symlink once * lint --- composer/trainer/trainer.py | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index 01cd0fcc9b..246c1b393b 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -1431,23 +1431,26 @@ def __init__( assert latest_remote_file_name is not None if self.state.fsdp_sharded_state_dict_enabled: ar_object_store = maybe_create_object_store_from_uri(save_folder) - # Symlink is on object store. + # Symlink is on object store if ar_object_store is not None: - with tempfile.TemporaryDirectory() as temp_dir: - local_symlink_file = str(Path(temp_dir) / Path('autoresume.symlink')) - formatted_latest_remote_file_name = format_name_with_dist(latest_remote_file_name, - self.state.run_name) + '.symlink' - rank0_formatted_latest_remote_file_name = dist.all_gather_object( - formatted_latest_remote_file_name)[0] - try: - ar_object_store.download_object(rank0_formatted_latest_remote_file_name, local_symlink_file) - with open(local_symlink_file, 'r') as f: - real_path = f.read() - log.debug(f'Read path {real_path} from symlink file') - autoresume_checkpoint_path = ar_object_store.get_uri(real_path) - except FileNotFoundError: - autoresume_checkpoint_path = None - # Symlink is local. + autoresume_checkpoint_path = None + if dist.get_global_rank() == 0: + with tempfile.TemporaryDirectory() as temp_dir: + local_symlink_file = str(Path(temp_dir) / Path('autoresume.symlink')) + symlink_file_name = format_name_with_dist(latest_remote_file_name, + self.state.run_name) + '.symlink' + try: + ar_object_store.download_object(symlink_file_name, local_symlink_file) + with open(local_symlink_file, 'r') as f: + real_path = f.read() + log.debug(f'Read path {real_path} from symlink file') + autoresume_checkpoint_path = ar_object_store.get_uri(real_path) + except FileNotFoundError: + pass + autoresume_path_list = [autoresume_checkpoint_path] + dist.broadcast_object_list(autoresume_path_list) + autoresume_checkpoint_path = autoresume_path_list[0] + # Symlink is local else: save_latest_filename = format_name_with_dist(save_latest_filename, self.state.run_name) rank0_save_latest_filename = dist.all_gather_object(save_latest_filename)[0] @@ -1460,7 +1463,7 @@ def __init__( else: autoresume_checkpoint_path = None - # Standard non-elastic codepath for autoresume. 
+ # Standard non-elastic codepath for autoresume else: autoresume_checkpoint_path = self._get_autoresume_checkpoint( save_folder=save_folder, From cda979739d9bbc74ff3c72d4aa4bd61017ed0905 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Wed, 21 Feb 2024 16:38:41 -0500 Subject: [PATCH 16/28] Add min size to OCI download (#3044) * add min size * logs * log info * swap * min 1 * remove log --- composer/utils/object_store/oci_object_store.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/composer/utils/object_store/oci_object_store.py b/composer/utils/object_store/oci_object_store.py index d36b13e03b..72898464cc 100644 --- a/composer/utils/object_store/oci_object_store.py +++ b/composer/utils/object_store/oci_object_store.py @@ -137,6 +137,7 @@ def download_object( filename: Union[str, pathlib.Path], overwrite: bool = False, callback: Optional[Callable[[int, int], None]] = None, + min_part_size: int = 128000000, num_parts: int = 10, ): del callback @@ -151,11 +152,14 @@ def download_object( object_size = 0 try: head_object_response = self.client.head_object(self.namespace, self.bucket, object_name) - object_size = head_object_response.headers['content-length'] # pyright: ignore[reportOptionalMemberAccess] + object_size = int(head_object_response.headers['content-length']) # pyright: ignore[reportOptionalMemberAccess] except Exception as e: _reraise_oci_errors(self.get_uri(object_name), e) + # Calculate the part sizes - base_part_size, remainder = divmod(int(object_size), num_parts) + num_parts_from_size = max(object_size // min_part_size, 1) + num_parts = min(num_parts, num_parts_from_size) + base_part_size, remainder = divmod(object_size, num_parts) part_sizes = [base_part_size] * num_parts for i in range(remainder): part_sizes[i] += 1 From 59a83d4aaa5ca202d2d644c81828f168a6d1f54c Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Wed, 21 Feb 2024 20:44:02 -0500 Subject: [PATCH 17/28] lint (#3045) --- composer/utils/object_store/oci_object_store.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/composer/utils/object_store/oci_object_store.py b/composer/utils/object_store/oci_object_store.py index 72898464cc..cf4b78bdbd 100644 --- a/composer/utils/object_store/oci_object_store.py +++ b/composer/utils/object_store/oci_object_store.py @@ -152,7 +152,8 @@ def download_object( object_size = 0 try: head_object_response = self.client.head_object(self.namespace, self.bucket, object_name) - object_size = int(head_object_response.headers['content-length']) # pyright: ignore[reportOptionalMemberAccess] + object_size = int( + head_object_response.headers['content-length']) # pyright: ignore[reportOptionalMemberAccess] except Exception as e: _reraise_oci_errors(self.get_uri(object_name), e) From a606314d5621ae1eec6b9a9799e6795ae7abb17b Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Wed, 21 Feb 2024 20:56:23 -0500 Subject: [PATCH 18/28] Revert "Change to pull_request_target (#3025)" (#3047) This reverts commit 7d6afe17a6fdcb97863ff7776502babfe2056d03. 
--- .github/workflows/code-quality.yaml | 57 +++++++++----------------- .github/workflows/codeql-analysis.yml | 49 ++++++++++++++-------- .github/workflows/coverage.yaml | 37 ++++++++--------- .github/workflows/daily.yaml | 3 -- .github/workflows/pr-code-quality.yaml | 28 +++++++++++++ .github/workflows/pr-cpu.yaml | 5 +-- .github/workflows/smoketest.yaml | 31 ++++++-------- 7 files changed, 111 insertions(+), 99 deletions(-) create mode 100644 .github/workflows/pr-code-quality.yaml diff --git a/.github/workflows/code-quality.yaml b/.github/workflows/code-quality.yaml index 9e00def203..20bbf327b7 100644 --- a/.github/workflows/code-quality.yaml +++ b/.github/workflows/code-quality.yaml @@ -1,17 +1,13 @@ name: Code Quality Checks on: - push: - branches: - - dev - - main - - release/** - pull_request_target: - workflow_dispatch: -# Cancel old runs when a new commit is pushed to the same branch if not on main -# or dev -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }} + workflow_call: + inputs: + python_version: + required: true + type: string + pip_deps: + required: true + type: string defaults: run: working-directory: . @@ -19,31 +15,16 @@ jobs: code-quality: runs-on: ubuntu-20.04 timeout-minutes: 15 - strategy: - matrix: - python_version: - - "3.9" - - "3.10" - - "3.11" - pip_deps: - - "[dev]" steps: - - name: Checkout - uses: actions/checkout@v3 - - name: Get Repo Token - id: REPO_TOKEN - uses: tibdex/github-app-token@v1 + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 with: - app_id: ${{ secrets.DEVOGE_BOT_PUB_APPID }} - private_key: ${{ secrets.DEVOGE_BOT_PUB_PEM }} - - name: Get composite run steps repository - uses: actions/checkout@v3 - with: - repository: mosaicml/testing - ref: v0.0.2 - path: ./testing - token: ${{ steps.REPO_TOKEN.outputs.token }} - - uses: ./testing/.github/actions/code-quality - with: - python_version: ${{ matrix.python_version }} - pip_deps: ${{ matrix.pip_deps }} + python-version: ${{ inputs.python_version }} + - name: Setup + run: | + set -ex + python -m pip install --upgrade 'pip<23' wheel + python -m pip install --upgrade .${{ inputs.pip_deps }} + - name: Run checks + run: | + pre-commit run --all-files diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index 511c70aa85..151179d524 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -14,7 +14,7 @@ name: "CodeQL" on: push: branches: [dev, main] - pull_request_target: + pull_request: # The branches below must be a subset of the branches above branches: [dev, main] schedule: @@ -41,19 +41,36 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v2 - - name: Get Repo Token - id: REPO_TOKEN - uses: tibdex/github-app-token@v1 - with: - app_id: ${{ secrets.DEVOGE_BOT_PUB_APPID }} - private_key: ${{ secrets.DEVOGE_BOT_PUB_PEM }} - - name: Get composite run steps repository - uses: actions/checkout@v3 - with: - repository: mosaicml/testing - ref: v0.0.2 - path: ./testing - token: ${{ steps.REPO_TOKEN.outputs.token }} - - uses: ./testing/.github/actions/codeql-analysis + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v2 with: - language: ${{ matrix.language }} + languages: ${{ matrix.language }} + # If you wish to specify custom queries, you can do so here or in a + # config file. 
+ # By default, queries listed here will override any specified in a + # config file. + # Prefix the list here with "+" to use these queries and those in the + # config file. + # queries: ./path/to/local/query, your-org/your-repo/queries@main + + # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). + # If this step fails, then you should remove it and run the build manually + # (see below) + - name: Autobuild + uses: github/codeql-action/autobuild@v2 + + # ℹī¸ Command-line programs to run using the OS shell. + # 📚 https://git.io/JvXDl + + # ✏ī¸ If the Autobuild fails above, remove it and uncomment the following + # three lines and modify them (or add more) to build your code if your + # project uses a compiled language + + # - run: | + # make bootstrap + # make release + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v2 diff --git a/.github/workflows/coverage.yaml b/.github/workflows/coverage.yaml index fe69f936ab..f89d67ec39 100644 --- a/.github/workflows/coverage.yaml +++ b/.github/workflows/coverage.yaml @@ -5,11 +5,6 @@ on: download-path: required: true type: string - secrets: - app_id: - required: true - private_key: - required: true jobs: coverage: timeout-minutes: 5 @@ -17,19 +12,21 @@ jobs: steps: - name: Checkout Repo uses: actions/checkout@v3 - - name: Get Repo Token - id: REPO_TOKEN - uses: tibdex/github-app-token@v1 - with: - app_id: ${{ secrets.app_id }} - private_key: ${{ secrets.private_key }} - - name: Get composite run steps repository - uses: actions/checkout@v3 - with: - repository: mosaicml/testing - ref: v0.0.2 - path: ./testing - token: ${{ steps.REPO_TOKEN.outputs.token }} - - uses: ./testing/.github/actions/coverage + - name: Setup + run: | + set -ex + python -m pip install --upgrade 'pip<23' wheel + pip install coverage[toml]==6.5.0 + - name: Download artifacts + uses: actions/download-artifact@v3 with: - download-path: ${{ inputs.download-path }} + path: ${{ inputs.download-path }} + - name: Generate coverage report + run: | + set -ex + + # Flatten the coverage files + ls ${{ inputs.download-path }} | while read x; do mv ${{ inputs.download-path }}/$x/.coverage .coverage.$x; done + + python -m coverage combine + python -m coverage report diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml index 3867a75b71..3c65b0f4fa 100644 --- a/.github/workflows/daily.yaml +++ b/.github/workflows/daily.yaml @@ -97,9 +97,6 @@ jobs: needs: [daily-pytest-cpu] with: download-path: artifacts - secrets: - app_id: ${{ secrets.DEVOGE_BOT_PUB_APPID }} - private_key: ${{ secrets.DEVOGE_BOT_PUB_PEM }} daily-pytest-gpu: uses: ./.github/workflows/pytest-gpu.yaml diff --git a/.github/workflows/pr-code-quality.yaml b/.github/workflows/pr-code-quality.yaml new file mode 100644 index 0000000000..26d2546e75 --- /dev/null +++ b/.github/workflows/pr-code-quality.yaml @@ -0,0 +1,28 @@ +name: PR Code Quality Checks +on: + push: + branches: + - dev + - main + - release/** + pull_request: + workflow_dispatch: +# Cancel old runs when a new commit is pushed to the same branch if not on main +# or dev +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }} +jobs: + code-quality: + uses: ./.github/workflows/code-quality.yaml + strategy: + matrix: + python_version: + - "3.9" + - "3.10" + - "3.11" + pip_deps: + - "[dev]" + with: + python_version: ${{ matrix.python_version }} + pip_deps: ${{ 
matrix.pip_deps }} diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index d3e752c70e..6eee54cb0b 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -1,6 +1,6 @@ name: PR CPU tests on: - pull_request_target: + pull_request: workflow_dispatch: # Cancel old runs when a new commit is pushed to the same branch if not on main # or dev @@ -43,6 +43,3 @@ jobs: needs: [pytest-cpu] with: download-path: artifacts - secrets: - app_id: ${{ secrets.DEVOGE_BOT_PUB_APPID }} - private_key: ${{ secrets.DEVOGE_BOT_PUB_PEM }} diff --git a/.github/workflows/smoketest.yaml b/.github/workflows/smoketest.yaml index 59286a41aa..429cc40b1d 100644 --- a/.github/workflows/smoketest.yaml +++ b/.github/workflows/smoketest.yaml @@ -5,7 +5,7 @@ on: - dev - main - release/** - pull_request_target: + pull_request: workflow_call: workflow_dispatch: # Cancel old runs when a new commit is pushed to the same branch if not on main @@ -27,21 +27,16 @@ jobs: - "3.10" - "3.11" steps: - - name: Checkout - uses: actions/checkout@v3 - - name: Get Repo Token - id: REPO_TOKEN - uses: tibdex/github-app-token@v1 + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 with: - app_id: ${{ secrets.DEVOGE_BOT_PUB_APPID }} - private_key: ${{ secrets.DEVOGE_BOT_PUB_PEM }} - - name: Get composite run steps repository - uses: actions/checkout@v3 - with: - repository: mosaicml/testing - ref: v0.0.2 - path: ./testing - token: ${{ steps.REPO_TOKEN.outputs.token }} - - uses: ./testing/.github/actions/smoketest - with: - python_version: ${{ matrix.python_version }} + python-version: ${{ matrix.python_version }} + - name: Setup + run: | + set -ex + python -m pip install --upgrade 'pip<23' wheel + python -m pip install --upgrade . + python -m pip install pytest==7.2.1 pytest_codeblocks==0.16.1 + - name: Run checks + run: | + pytest tests/test_smoketest.py From 2133c17740d7aaa4368909409712a6be6dabc3a6 Mon Sep 17 00:00:00 2001 From: bigning Date: Wed, 21 Feb 2024 18:27:50 -0800 Subject: [PATCH 19/28] [fix auto-microbatch] FSDP reshard and cleanup after OOM to fix the cuda memory leak (#3030) * reshard and cleanup * format * fix * cleanup unit test * comments * more test * fix the warning * add numerical correctness test * Apply suggestions from code review Co-authored-by: Mihir Patel * lint * fix test warnning * revert irrelevant change --------- Co-authored-by: Mihir Patel --- composer/trainer/trainer.py | 18 ++++++ tests/trainer/test_fsdp.py | 113 +++++++++++++++++++++++++++++++++++- 2 files changed, 129 insertions(+), 2 deletions(-) diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index 246c1b393b..7411dc4393 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -30,6 +30,8 @@ import torch.utils.data from torch._dynamo import OptimizedModule from torch.cuda.amp.grad_scaler import GradScaler, _refresh_per_optimizer_state +from torch.distributed.fsdp import FullyShardedDataParallel +from torch.distributed.fsdp._runtime_utils import _post_backward_final_callback from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler from torch.nn.parallel import DistributedDataParallel from torch.optim.lr_scheduler import LRScheduler @@ -232,6 +234,21 @@ def _is_cuda_oom(e: RuntimeError): return False +def _fsdp_reshard_and_cleanup(model: torch.nn.Module): + """Manually reshard and clean up FSDP model. + + When an exception like OOM happens, _post_backward_final_callback, which + is registered as a backward callback, will not run. 
We manually call it to cleanup + loose memory. + """ + for __, module in model.named_modules(): + if isinstance(module, FullyShardedDataParallel): + if module.check_is_root(): + # Only call _post_backward_final_callback on root module. It will + # traverse and reshard all FSDP sub-modules + _post_backward_final_callback(module, module) + + def _adjust_device_train_microbatch_size(state: State): """Adjust device_train_microbatch_size if we encounter OOM. @@ -259,6 +276,7 @@ def _adjust_device_train_microbatch_size(state: State): optimizer.zero_grad(set_to_none=True) if state.scaler is not None: state.scaler._per_optimizer_states = defaultdict(_refresh_per_optimizer_state) + _fsdp_reshard_and_cleanup(state.model) torch.cuda.empty_cache() diff --git a/tests/trainer/test_fsdp.py b/tests/trainer/test_fsdp.py index 21d6bbbd52..76c59c0cc1 100644 --- a/tests/trainer/test_fsdp.py +++ b/tests/trainer/test_fsdp.py @@ -10,7 +10,7 @@ from torch.utils.data import DataLoader from composer.models import ComposerClassifier, ComposerModel -from composer.trainer.trainer import Trainer +from composer.trainer.trainer import Trainer, _fsdp_reshard_and_cleanup from composer.utils import dist from tests.common import (EmbeddedWeightTiedModel, RandomClassificationDataset, SimpleModel, SimpleWeightTiedModel, world_size) @@ -232,10 +232,11 @@ def __init__(self, num_features: int = 128, device: str = 'cuda'): super().__init__() self.fc1 = torch.nn.Linear(num_features, num_features, device=device, bias=False) self.fc2 = torch.nn.Linear(num_features, num_features, device=device, bias=False) + self.relu = torch.nn.ReLU() def forward(self, x): x = self.fc1(x) - x = torch.nn.ReLU(x) + x = self.relu(x) x = self.fc2(x) return x @@ -282,3 +283,111 @@ def test_fsdp_act_ckpt_offload( assert isinstance(trainer.state.model.fc1._fsdp_wrapped_module, OffloadWrapper) else: assert not isinstance(trainer.state.model.fc1._fsdp_wrapped_module, CheckpointWrapper) + + +@pytest.mark.gpu +@world_size(2) +def test_fsdp_reshard_after_oom(world_size: int): + model = SimpleMLP(num_features=128) + model.relu._fsdp_wrap = False # pyright: ignore[reportGeneralTypeIssues] + + def oom_hook(*args): + raise RuntimeError('CUDA out of memory.') + + model.fc2.register_full_backward_hook(oom_hook) + + trainer = Trainer( + model=model, + fsdp_config={}, + max_duration='3ba', + ) + fsdp_model = trainer.state.model + + x = torch.rand([2, 128]) + output = fsdp_model(x) + with pytest.raises(Exception): + # Backward triggers the fake OOM exception, + # which prevents fsdp reshard and cleanup + torch.sum(output).backward() + + fc2_flat_param = fsdp_model.fc2._flat_param + + # Without cleanup, model.fc2.flat_params is still in unshard state + # the full param is not freed + assert fc2_flat_param.data_ptr() != fc2_flat_param._local_shard.data_ptr() + assert fc2_flat_param._full_param_padded.numel() > 0 + + _fsdp_reshard_and_cleanup(fsdp_model) + assert fc2_flat_param.data_ptr() == fc2_flat_param._local_shard.data_ptr() + assert fc2_flat_param._full_param_padded._typed_storage()._size() == 0 + + +@pytest.mark.gpu +@world_size(2) +def test_fsdp_same_state_after_oom_reshard(world_size: int): + # Test numerical correctness after continuing to train with smaller batch size after OOM. 
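    # (Annotated outline, not part of the committed diff: the test below seeds
    # two identical FSDP-wrapped SimpleMLPs and copies the state_dict from one
    # to the other; a full-backward hook fakes a CUDA OOM on the second model
    # for the batch-size-4 input; after _fsdp_reshard_and_cleanup and
    # zero_grad, the hook is removed, both models take one SGD step on the
    # same batch, and a final forward pass must produce torch.equal outputs.)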
+    model = SimpleMLP(num_features=2)
+    model.fc1._fsdp_wrap = True  # pyright: ignore[reportGeneralTypeIssues]
+    model.fc2._fsdp_wrap = True  # pyright: ignore[reportGeneralTypeIssues]
+    model.relu._fsdp_wrap = False  # pyright: ignore[reportGeneralTypeIssues]
+    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
+
+    trainer = Trainer(
+        model=model,
+        fsdp_config={},
+        dist_timeout=20,
+        optimizers=optimizer,
+        seed=1,
+    )
+    fsdp_model = trainer.state.model
+
+    state_dict = fsdp_model.state_dict()
+
+    oom_model = SimpleMLP(num_features=2)
+    oom_model.fc1._fsdp_wrap = True  # pyright: ignore[reportGeneralTypeIssues]
+    oom_model.fc2._fsdp_wrap = True  # pyright: ignore[reportGeneralTypeIssues]
+    oom_model.relu._fsdp_wrap = False  # pyright: ignore[reportGeneralTypeIssues]
+    oom_model_optimizer = torch.optim.SGD(oom_model.parameters(), lr=0.1)
+
+    def oom_hook(module, grad_input, grad_output):
+        if grad_output[0].shape[0] >= 4:
+            raise RuntimeError('CUDA out of memory.')
+
+    oom_handle = oom_model.fc2.register_full_backward_hook(oom_hook)
+    oom_trainer = Trainer(
+        model=oom_model,
+        fsdp_config={},
+        dist_timeout=20,
+        optimizers=oom_model_optimizer,
+        seed=1,
+    )
+
+    fsdp_oom_model = oom_trainer.state.model
+    fsdp_oom_model.load_state_dict(state_dict)
+
+    x = torch.rand([4, 2])
+
+    # Run fwd + bwd + optimizer on normal model
+    output_0 = fsdp_model(x)
+    torch.sum(output_0).backward()
+    optimizer.step()
+
+    # Run fwd + bwd + optimizer on OOM model
+    output = fsdp_oom_model(x)
+    with pytest.raises(Exception):
+        torch.sum(output).backward()
+    # Cleanup after OOM
+    _fsdp_reshard_and_cleanup(fsdp_oom_model)
+    oom_model_optimizer.zero_grad(set_to_none=True)
+
+    oom_handle.remove()
+    output = fsdp_oom_model(x)
+    torch.sum(output).backward()
+    oom_model_optimizer.step()
+
+    # Run another fwd on both models and check
+    # if the outputs are the same
+    output_1 = fsdp_model(x)
+    output_2 = fsdp_oom_model(x)
+
+    assert torch.equal(output_1, output_2)

From ccb5e5647e658c16c1910b4d5694596b03cfb428 Mon Sep 17 00:00:00 2001
From: Charles Tang
Date: Wed, 21 Feb 2024 18:37:15 -0800
Subject: [PATCH 20/28] Bump composer version (#3048)

---
 composer/_version.py            |  2 +-
 docker/README.md                |  4 ++--
 docker/build_matrix.yaml        | 12 ++++++------
 docker/generate_build_matrix.py |  2 +-
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/composer/_version.py b/composer/_version.py
index 6a46c95e08..e638cdc5a7 100644
--- a/composer/_version.py
+++ b/composer/_version.py
@@ -3,4 +3,4 @@
 
 """The Composer Version."""
 
-__version__ = '0.19.1'
+__version__ = '0.19.2'
diff --git a/docker/README.md b/docker/README.md
index e66ab3049f..b7f7832c7b 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -15,8 +15,8 @@ all dependencies for both NLP and Vision models. 
They are built on top of the
 | Composer Version   | CUDA Support   | Docker Tag                                                     |
 |--------------------|----------------|----------------------------------------------------------------|
-| 0.19.1             | Yes            | `mosaicml/composer:latest`, `mosaicml/composer:0.19.1`         |
-| 0.19.1             | No             | `mosaicml/composer:latest_cpu`, `mosaicml/composer:0.19.1_cpu` |
+| 0.19.2             | Yes            | `mosaicml/composer:latest`, `mosaicml/composer:0.19.2`         |
+| 0.19.2             | No             | `mosaicml/composer:latest_cpu`, `mosaicml/composer:0.19.2_cpu` |
 
 **Note**: For a lightweight installation, we recommended using a [MosaicML PyTorch Image](#pytorch-images) and manually
diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml
index 13803e45db..765d6cf180 100644
--- a/docker/build_matrix.yaml
+++ b/docker/build_matrix.yaml
@@ -246,9 +246,9 @@
   TORCHVISION_VERSION: 0.18.0
 - AWS_OFI_NCCL_VERSION: ''
   BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04
-  COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.19.1
+  COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.19.2
   CUDA_VERSION: 12.1.0
-  IMAGE_NAME: composer-0-19-1
+  IMAGE_NAME: composer-0-19-2
   MOFED_VERSION: 5.5-1.0.3.2
   NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471
@@ -269,15 +269,15 @@
   PYTORCH_NIGHTLY_VERSION: ''
   PYTORCH_VERSION: 2.1.2
   TAGS:
-  - mosaicml/composer:0.19.1
+  - mosaicml/composer:0.19.2
   - mosaicml/composer:latest
   TARGET: composer_stage
   TORCHVISION_VERSION: 0.16.2
 - AWS_OFI_NCCL_VERSION: ''
   BASE_IMAGE: ubuntu:20.04
-  COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.19.1
+  COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.19.2
   CUDA_VERSION: ''
-  IMAGE_NAME: composer-0-19-1-cpu
+  IMAGE_NAME: composer-0-19-2-cpu
   MOFED_VERSION: 5.5-1.0.3.2
   NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
   PYTHON_VERSION: '3.10'
   PYTORCH_NIGHTLY_VERSION: ''
   PYTORCH_VERSION: 2.1.2
   TAGS:
-  - mosaicml/composer:0.19.1_cpu
+  - mosaicml/composer:0.19.2_cpu
   - mosaicml/composer:latest_cpu
   TARGET: composer_stage
   TORCHVISION_VERSION: 0.16.2
diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py
index d59219f811..cd4b0e0f27 100644
--- a/docker/generate_build_matrix.py
+++ b/docker/generate_build_matrix.py
@@ -261,7 +261,7 @@ def _main():
     composer_entries = []
 
     # The `GIT_COMMIT` is a placeholder and Jenkins will substitute it with the actual git commit for the `composer_staging` images
-    composer_versions = ['0.19.1']  # Only build images for the latest composer version
+    composer_versions = ['0.19.2']  # Only build images for the latest composer version
     composer_python_versions = [PRODUCTION_PYTHON_VERSION]  # just build composer against the latest
 
     for product in itertools.product(composer_python_versions, composer_versions, cuda_options):

From c0a9697940de5ea3e144210d26d533c3a651ac60 Mon Sep 17 00:00:00 2001
From: Bruce Fontaine
Date: Thu, 22 Feb 2024 11:17:18 -0800
Subject: [PATCH 21/28] Update XLA support (#2964)

* Fix initialization and microbatching for TPUs

* add version check for PyTorch XLA >= 2.1
---
 composer/devices/device_tpu.py |  1 +
 composer/trainer/trainer.py    |  5 +++++
 composer/utils/dist.py         | 17 +++++++++++++++--
 3 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/composer/devices/device_tpu.py b/composer/devices/device_tpu.py
index b91d1bc478..813fc49924 100644
--- a/composer/devices/device_tpu.py
+++ b/composer/devices/device_tpu.py
@@ -26,6 +26,7 @@ class DeviceTPU(Device):
 
     More details.
""" + dist_backend = 'xla' name = 'tpu' def __init__(self): diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index 7411dc4393..0d2349bf93 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -2567,6 +2567,11 @@ def _train_microbatch(self, use_grad_scaling: bool, current_batch_size: int, microbatch_loss.mul_(microbatch_num_samples / current_batch_size) microbatch_loss.backward(create_graph=self._backwards_create_graph) + if self.state.device.dist_backend == 'xla': + # For xla devices, the program between any pair of mark_steps() calls is compiled. With out this, the + # microbatching loop is unrolled, drastically increasing compile time. + xm.mark_step() + self.engine.run_event(Event.AFTER_BACKWARD) # Use microbatch outputs to update training metrics diff --git a/composer/utils/dist.py b/composer/utils/dist.py index 65edb5e80c..5b8dd5df68 100644 --- a/composer/utils/dist.py +++ b/composer/utils/dist.py @@ -37,6 +37,7 @@ import logging import os import pickle +import sys import time from contextlib import contextmanager from typing import TYPE_CHECKING, Any, List, Optional, Sequence, TypeVar, Union, cast @@ -44,8 +45,12 @@ import torch import torch.distributed as dist import torch.utils.data +from packaging import version -from composer.utils.device import get_device, is_hpu_installed +from composer.utils.device import get_device, is_hpu_installed, is_tpu_installed + +if is_tpu_installed(): + import torch_xla if TYPE_CHECKING: from composer.devices import Device @@ -534,7 +539,15 @@ def initialize_dist(device: Union[str, Device], timeout: float = 300.0): dist_env_vars_match_defaults = all(os.environ.get(k, v) == v for (k, v) in dist_env_var_defaults.items()) - if dist_env_vars_match_defaults: + if device_obj.dist_backend == 'xla': + if not 'torch_xla' in sys.modules: + raise RuntimeError('PyTorch XLA package not found. In order to use XLA based devices ' + 'PyTorch XLA must be installed.') + if version.parse(torch_xla.__version__) < version.parse('2.1.0'): + raise RuntimeError(f'PyTorch XLA version must be at least 2.1.0, found {torch_xla.__version__}.') + # XLA initialization requires the init_method to be set + dist.init_process_group(device_obj.dist_backend, init_method='xla://') + elif dist_env_vars_match_defaults: # Fill in the remaining single-rank variables os.environ.update(dist_env_var_defaults) dist.init_process_group(device_obj.dist_backend, store=dist.HashStore(), world_size=1, rank=0) From d3987a0542b36ad8b07cab23ddc51679ac2cf61d Mon Sep 17 00:00:00 2001 From: Charles Tang Date: Thu, 22 Feb 2024 14:27:38 -0800 Subject: [PATCH 22/28] Bump composer version 0.20.0 (#3051) --- composer/_version.py | 2 +- docker/README.md | 4 ++-- docker/build_matrix.yaml | 12 ++++++------ docker/generate_build_matrix.py | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/composer/_version.py b/composer/_version.py index e638cdc5a7..cb43d310d0 100644 --- a/composer/_version.py +++ b/composer/_version.py @@ -3,4 +3,4 @@ """The Composer Version.""" -__version__ = '0.19.2' +__version__ = '0.20.0' diff --git a/docker/README.md b/docker/README.md index b7f7832c7b..d0624e2665 100644 --- a/docker/README.md +++ b/docker/README.md @@ -15,8 +15,8 @@ all dependencies for both NLP and Vision models. 
They are built on top of the | Composer Version | CUDA Support | Docker Tag | |--------------------|----------------|----------------------------------------------------------------| -| 0.19.2 | Yes | `mosaicml/composer:latest`, `mosaicml/composer:0.19.2` | -| 0.19.2 | No | `mosaicml/composer:latest_cpu`, `mosaicml/composer:0.19.2_cpu` | +| 0.20.0 | Yes | `mosaicml/composer:latest`, `mosaicml/composer:0.20.0` | +| 0.20.0 | No | `mosaicml/composer:latest_cpu`, `mosaicml/composer:0.20.0_cpu` | **Note**: For a lightweight installation, we recommended using a [MosaicML PyTorch Image](#pytorch-images) and manually diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml index 765d6cf180..21c36347e9 100644 --- a/docker/build_matrix.yaml +++ b/docker/build_matrix.yaml @@ -246,9 +246,9 @@ TORCHVISION_VERSION: 0.18.0 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 - COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.19.2 + COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.20.0 CUDA_VERSION: 12.1.0 - IMAGE_NAME: composer-0-19-2 + IMAGE_NAME: composer-0-20-0 MOFED_VERSION: 5.5-1.0.3.2 NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 @@ -269,15 +269,15 @@ PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.1.2 TAGS: - - mosaicml/composer:0.19.2 + - mosaicml/composer:0.20.0 - mosaicml/composer:latest TARGET: composer_stage TORCHVISION_VERSION: 0.16.2 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: ubuntu:20.04 - COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.19.2 + COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.20.0 CUDA_VERSION: '' - IMAGE_NAME: composer-0-19-2-cpu + IMAGE_NAME: composer-0-20-0-cpu MOFED_VERSION: 5.5-1.0.3.2 NVIDIA_REQUIRE_CUDA_OVERRIDE: '' PYTHON_VERSION: '3.10' @@ -285,7 +285,7 @@ PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.1.2 TAGS: - - mosaicml/composer:0.19.2_cpu + - mosaicml/composer:0.20.0_cpu - mosaicml/composer:latest_cpu TARGET: composer_stage TORCHVISION_VERSION: 0.16.2 diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py index cd4b0e0f27..ca378388c6 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -261,7 +261,7 @@ def _main(): composer_entries = [] # The `GIT_COMMIT` is a placeholder and Jenkins will substitute it with the actual git commit for the `composer_staging` images - composer_versions = ['0.19.2'] # Only build images for the latest composer version + composer_versions = ['0.20.0'] # Only build images for the latest composer version composer_python_versions = [PRODUCTION_PYTHON_VERSION] # just build composer against the latest for product in itertools.product(composer_python_versions, composer_versions, cuda_options): From 9ecea4f61a80e468a6d4c5cadf94698734faf935 Mon Sep 17 00:00:00 2001 From: Aaron Gokaslan Date: Thu, 22 Feb 2024 18:23:54 -0500 Subject: [PATCH 23/28] Update ruff. Fix PLE & LOG lints (#3050) * Update ruff. Fix and enable PLE lints * Enable LOG checks too --- .pre-commit-config.yaml | 2 +- composer/cli/launcher.py | 2 +- composer/profiler/utils.py | 2 +- pyproject.toml | 5 ++++- 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4f89154571..5f37f27121 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,7 +3,7 @@ default_language_version: repos: - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. 
-  rev: v0.0.282
+  rev: v0.2.2
   hooks:
   - id: ruff
     args: [--fix, --exit-non-zero-on-fix]
diff --git a/composer/cli/launcher.py b/composer/cli/launcher.py
index 3dd33aede7..b27181ccc5 100755
--- a/composer/cli/launcher.py
+++ b/composer/cli/launcher.py
@@ -471,7 +471,7 @@ def main():
     args = _parse_args()
 
     logging.basicConfig()
-    log.setLevel(logging.INFO if args.verbose else logging.WARN)
+    log.setLevel(logging.INFO if args.verbose else logging.WARNING)
 
     processes = {}
 
diff --git a/composer/profiler/utils.py b/composer/profiler/utils.py
index b4df8396a7..d9200ccb46 100644
--- a/composer/profiler/utils.py
+++ b/composer/profiler/utils.py
@@ -93,5 +93,5 @@ def export_memory_timeline_html(prof: TorchProfile,
     with open(path, 'w') as f:
         f.write(html)
-    log.debug('Memory timeline exported to', path, '.')
+    log.debug('Memory timeline exported to %s.', path)
     remove(tmpfile.name)
diff --git a/pyproject.toml b/pyproject.toml
index 1583440640..a18efde55a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,12 +10,14 @@ line_length = 120
 skip = [ "env", "wandb", "runs", "build", "node_modules" ]
 known_third_party = ["wandb"]
 
-[tool.ruff]
+[tool.ruff.lint]
 select = [
     "C4",
     # TODO port pydocstyle
     # "D", # pydocstyle
+    "LOG",
     "PERF",
+    "PLE",
 ]
 
 ignore = [
     "PERF2",
     "PERF4",
 ]
+[tool.ruff]
 exclude = [
     "build/**",
     "docs/**",

From d5692e445c37ae98bc3516767cfdb2ed49e93243 Mon Sep 17 00:00:00 2001
From: Jerry Chen
Date: Thu, 22 Feb 2024 17:23:22 -0800
Subject: [PATCH 24/28] Rename composer_run_name tag to run_name when logging to MLflow (#3040)

* Rename composer_run_name tag to run_name when logging to MLflow

* backwards compatibility
---
 composer/loggers/mlflow_logger.py   | 13 +++++++++++--
 tests/loggers/test_mlflow_logger.py | 24 ++++++++++++++++++++++--
 2 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/composer/loggers/mlflow_logger.py b/composer/loggers/mlflow_logger.py
index 0e13791d64..c8a0be47d0 100644
--- a/composer/loggers/mlflow_logger.py
+++ b/composer/loggers/mlflow_logger.py
@@ -144,7 +144,7 @@ def init(self, state: State, logger: Logger) -> None:
 
         # Store the Composer run name in the MLFlow run tags so it can be retrieved for autoresume.
         self.tags = self.tags or {}
-        self.tags['composer_run_name'] = state.run_name
+        self.tags['run_name'] = state.run_name
 
         # Adjust name and group based on `rank_zero_only`.
         if not self._rank_zero_only:
@@ -162,8 +162,17 @@ def init(self, state: State, logger: Logger) -> None:
             # Search for an existing run tagged with this Composer run.
             assert self._experiment_id is not None
             existing_runs = mlflow.search_runs(experiment_ids=[self._experiment_id],
-                                               filter_string=f'tags.composer_run_name = "{state.run_name}"',
+                                               filter_string=f'tags.run_name = "{state.run_name}"',
                                                output_format='list')
+
+            # Check for the old tag (`composer_run_name`) for backwards compatibility, in case a run using the old
+            # tag fails and the run is resumed with a newer version of Composer that uses `run_name` instead of
+            # `composer_run_name`.
+ if len(existing_runs) == 0: + existing_runs = mlflow.search_runs(experiment_ids=[self._experiment_id], + filter_string=f'tags.composer_run_name = "{state.run_name}"', + output_format='list') + if len(existing_runs) > 0: self._run_id = existing_runs[0].info.run_id else: diff --git a/tests/loggers/test_mlflow_logger.py b/tests/loggers/test_mlflow_logger.py index d5de5b8171..6dd02ab30e 100644 --- a/tests/loggers/test_mlflow_logger.py +++ b/tests/loggers/test_mlflow_logger.py @@ -167,7 +167,7 @@ def test_mlflow_experiment_init_experiment_name(monkeypatch): def test_mlflow_experiment_init_existing_composer_run(monkeypatch): - """ Test that an existing MLFlow run is used if one already exists in the experiment for the Composer run. + """ Test that an existing MLFlow run is used if one tagged with `run_name` exists in the experiment for the Composer run. """ mlflow = pytest.importorskip('mlflow') @@ -186,6 +186,26 @@ def test_mlflow_experiment_init_existing_composer_run(monkeypatch): assert test_logger._run_id == existing_id +def test_mlflow_experiment_init_existing_composer_run_with_old_tag(monkeypatch): + """ Test that an existing MLFlow run is used if one exists with the old `composer_run_name` tag. + """ + mlflow = pytest.importorskip('mlflow') + + monkeypatch.setattr(mlflow, 'set_tracking_uri', MagicMock()) + monkeypatch.setattr(mlflow, 'start_run', MagicMock()) + + mock_state = MagicMock() + mock_state.composer_run_name = 'dummy-run-name' + + existing_id = 'dummy-id' + mock_search_runs = MagicMock(return_value=[MagicMock(info=MagicMock(run_id=existing_id))]) + monkeypatch.setattr(mlflow, 'search_runs', mock_search_runs) + + test_logger = MLFlowLogger() + test_logger.init(state=mock_state, logger=MagicMock()) + assert test_logger._run_id == existing_id + + def test_mlflow_experiment_set_up(tmp_path): """ Test that MLFlow experiment is set up correctly within mlflow """ @@ -231,7 +251,7 @@ def test_mlflow_experiment_set_up(tmp_path): assert actual_run_name == expected_run_name # Check run tagged with Composer run name. - assert tags['composer_run_name'] == mock_state.run_name + assert tags['run_name'] == mock_state.run_name # Check run ended. test_mlflow_logger.post_close() From a042759bbcafc0e069d74407b74bc782b19a5088 Mon Sep 17 00:00:00 2001 From: Vitaliy Chiley <6439018+vchiley@users.noreply.github.com> Date: Fri, 23 Feb 2024 11:33:15 -0800 Subject: [PATCH 25/28] enable aggregate mem monitoring (#3042) * enable aggregate mem monitoring * add test * lint * make more deterministic * pr comments * Update composer/callbacks/memory_monitor.py Co-authored-by: Mihir Patel * updt doc str --------- Co-authored-by: Mihir Patel --- composer/callbacks/memory_monitor.py | 52 +++++++++++++++++++++++++- tests/callbacks/test_memory_monitor.py | 39 +++++++++++++++++++ 2 files changed, 90 insertions(+), 1 deletion(-) diff --git a/composer/callbacks/memory_monitor.py b/composer/callbacks/memory_monitor.py index cc9341ef0d..d55326222a 100644 --- a/composer/callbacks/memory_monitor.py +++ b/composer/callbacks/memory_monitor.py @@ -7,7 +7,9 @@ import warnings from typing import Dict, Optional, Union +import torch import torch.cuda +from torch import distributed from composer.core import Callback, State from composer.loggers import Logger @@ -17,6 +19,37 @@ __all__ = ['MemoryMonitor'] +def reduce_value( + value: Union[int, float], + model_device: torch.device, + reduce_op: str = 'mean', +): + """Reduce a value across distributed processes. + + Args: + value (Union[int, float]): The value to reduce. 
+        model_device (torch.device): The device on which the model is located.
+        reduce_op (str, optional): The reduction operation to perform. One of 'mean', 'avg', 'sum', 'min', 'max'.
+            Defaults to 'mean'.
+    """
+    tensor_value = torch.tensor(value, device=model_device)
+
+    if reduce_op in ['mean', 'avg', 'sum']:
+        op = distributed.ReduceOp.SUM
+    elif reduce_op == 'min':
+        op = distributed.ReduceOp.MIN
+    elif reduce_op == 'max':
+        op = distributed.ReduceOp.MAX
+    else:
+        raise ValueError(f'{reduce_op=} not supported.')
+
+    distributed.all_reduce(tensor_value, op=op)
+    if reduce_op in ['mean', 'avg']:
+        tensor_value = tensor_value / distributed.get_world_size()
+
+    return tensor_value.item()
+
+
 class MemoryMonitor(Callback):
     """Logs the memory usage of the model.
@@ -73,6 +106,9 @@ class MemoryMonitor(Callback):
     | alloc_retries          | Number of failed cudaMalloc calls that result in a cache flush and retry.                |
     +------------------------+-------------------------------------------------------------------------------------------+
 
+    Additionally, if `dist_aggregate_batch_interval` is enabled, the `avg`, `min`, and `max` of the
+    aforementioned statistics are also logged.
+
     .. note::
        Memory usage monitoring is only supported for GPU devices.
 
@@ -81,10 +117,17 @@ class MemoryMonitor(Callback):
             are the names of memory statistics to log from `torch.cuda.memory_stats()`, and values
             are the names they will be logged under. If not provided, the above statistics are
             logged. Defaults to None.
+        dist_aggregate_batch_interval (int, optional): Interval for aggregating memory stats across
+            all nodes. Defaults to None (i.e., the functionality is disabled).
     """
 
-    def __init__(self, memory_keys: Optional[Dict[str, str]] = None) -> None:
+    def __init__(
+        self,
+        memory_keys: Optional[Dict[str, str]] = None,
+        dist_aggregate_batch_interval: Optional[int] = None,
+    ) -> None:
         self.memory_keys = memory_keys
+        self.dist_aggregate_batch_interval = dist_aggregate_batch_interval
 
     def init(self, state: State, logger: Logger) -> None:
         # Not relying on `torch.cuda.is_available()` since the model could be on CPU.
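Aside (editor's illustration, not part of the patch): a minimal usage sketch of the new aggregation flag, assuming a script launched with the composer CLI on at least two GPUs; `model` and `train_dataloader` stand in for any ComposerModel and dataloader defined elsewhere.

from composer.callbacks import MemoryMonitor
from composer.trainer import Trainer

# Aggregate each memory statistic across ranks every batch; the logger then
# receives memory/peak_allocated_mem plus *_avg, *_min, and *_max variants.
trainer = Trainer(
    model=model,
    train_dataloader=train_dataloader,
    max_duration='10ba',
    callbacks=[MemoryMonitor(dist_aggregate_batch_interval=1)],
)
trainer.fit()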
@@ -101,6 +144,13 @@ def after_train_batch(self, state: State, logger: Logger):
             return
 
         memory_report = _get_memory_report(self.memory_keys)
+        if self.dist_aggregate_batch_interval is not None and state.timestamp.batch.value % self.dist_aggregate_batch_interval == 0:
+            dist_memory_report = {}
+            for (mem_stat, val) in memory_report.items():
+                dist_memory_report[mem_stat + '_avg'] = reduce_value(val, model_device, 'avg')
+                dist_memory_report[mem_stat + '_min'] = reduce_value(val, model_device, 'min')
+                dist_memory_report[mem_stat + '_max'] = reduce_value(val, model_device, 'max')
+            memory_report.update(dist_memory_report)
 
         logger.log_metrics({f'memory/{mem_stat}': val for (mem_stat, val) in memory_report.items()})
 
diff --git a/tests/callbacks/test_memory_monitor.py b/tests/callbacks/test_memory_monitor.py
index f2badc638c..28a782a58a 100644
--- a/tests/callbacks/test_memory_monitor.py
+++ b/tests/callbacks/test_memory_monitor.py
@@ -2,11 +2,14 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import pytest
+import torch
 from torch.utils.data import DataLoader
+from torch.utils.data.distributed import DistributedSampler
 
 from composer.callbacks import MemoryMonitor
 from composer.loggers import InMemoryLogger
 from composer.trainer import Trainer
+from composer.utils import dist, get_device
 from tests.common import RandomClassificationDataset, SimpleModel
 
 
@@ -38,3 +41,39 @@ def test_memory_monitor_gpu():
 
     num_memory_monitor_calls = len(in_memory_logger.data['memory/peak_allocated_mem'])
     assert num_memory_monitor_calls == int(trainer.state.timestamp.batch)
+
+
+@pytest.mark.gpu
+@pytest.mark.world_size(2)
+def test_dist_memory_monitor_gpu():
+    dist.initialize_dist(get_device(None))
+
+    # Construct the trainer
+    memory_monitor = MemoryMonitor(dist_aggregate_batch_interval=1)
+    in_memory_logger = InMemoryLogger()
+
+    # Add extra memory usage to rank 1
+    numel = 1 << 30  # ~1B float32 elements, i.e. about 4 GB (4 bytes * numel)
+    if dist.get_local_rank() == 1:
+        _ = torch.randn(numel, device='cuda')
+
+    dataset = RandomClassificationDataset()
+    trainer = Trainer(
+        model=SimpleModel(),
+        callbacks=memory_monitor,
+        loggers=in_memory_logger,
+        train_dataloader=DataLoader(dataset=dataset, sampler=DistributedSampler(dataset=dataset)),
+        max_duration='2ba',
+    )
+    trainer.fit()
+
+    peak_allocated_mem = in_memory_logger.data['memory/peak_allocated_mem'][-1][-1]
+    peak_allocated_mem = round(peak_allocated_mem, 2)
+    peak_allocated_mem_max = in_memory_logger.data['memory/peak_allocated_mem_max'][-1][-1]
+    peak_allocated_mem_max = round(peak_allocated_mem_max, 2)
+
+    if dist.get_local_rank() == 0:
+        assert peak_allocated_mem_max > peak_allocated_mem
+
+    if dist.get_local_rank() == 1:
+        assert peak_allocated_mem_max == peak_allocated_mem

From 1c19e1c08f88a9c13741cbea3546da277d751082 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 26 Feb 2024 10:17:51 -0500
Subject: [PATCH 26/28] Bump junitparser from 3.1.1 to 3.1.2 (#3056)

Bumps [junitparser](https://github.com/weiwei/junitparser) from 3.1.1 to 3.1.2.
- [Changelog](https://github.com/weiwei/junitparser/blob/master/CHANGELOG.md)
- [Commits](https://github.com/weiwei/junitparser/compare/3.1.1...3.1.2)

---
updated-dependencies:
- dependency-name: junitparser
  dependency-type: direct:development
  update-type: version-update:semver-patch
...
Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index f77ff65036..6c9d915a48 100644
--- a/setup.py
+++ b/setup.py
@@ -99,7 +99,7 @@ def package_files(prefix: str, directory: str, extension: str):
     # Pinning versions strictly to avoid random test failures.
     # Should manually update dependency versions occassionally.
     'custom_inherit==2.4.1',
-    'junitparser==3.1.1',
+    'junitparser==3.1.2',
     'coverage[toml]==7.4.1',
     'fasteners==0.18',  # object store tests require fasteners
     'pytest==7.4.4',

From 1c52d47626fe7c85edf22341a566fd1b73defa0a Mon Sep 17 00:00:00 2001
From: Mihir Patel
Date: Mon, 26 Feb 2024 16:13:55 -0500
Subject: [PATCH 27/28] Add SHARD_GRAD_OP to device mesh error check (#3058)

* fix tests

* fix error

* fix
---
 composer/trainer/dist_strategy.py |  4 +++-
 tests/trainer/test_fsdp.py        | 16 +++++++++++++---
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/composer/trainer/dist_strategy.py b/composer/trainer/dist_strategy.py
index a0e9a38656..6f9010bebe 100644
--- a/composer/trainer/dist_strategy.py
+++ b/composer/trainer/dist_strategy.py
@@ -303,7 +303,9 @@ def sync_hook(*args):
     if version.parse(torch.__version__.split('.dev')[0]) >= version.parse('2.2.0'):
         if 'device_mesh' in fsdp_config:
             device_mesh_size = len(fsdp_config['device_mesh'])
-            if sharding_strategy in [ShardingStrategy.FULL_SHARD, ShardingStrategy.NO_SHARD] and device_mesh_size != 1:
+            if sharding_strategy in [
+                    ShardingStrategy.FULL_SHARD, ShardingStrategy.SHARD_GRAD_OP, ShardingStrategy.NO_SHARD
+            ] and device_mesh_size != 1:
                 raise ValueError(f'FSDP sharding strategy {sharding_map_key.upper()} requires a device mesh '
                                  f'of size 1 but got device mesh size of {device_mesh_size}.')
             elif sharding_strategy in [ShardingStrategy.HYBRID_SHARD, ShardingStrategy._HYBRID_SHARD_ZERO2
diff --git a/tests/trainer/test_fsdp.py b/tests/trainer/test_fsdp.py
index 76c59c0cc1..dfa4c1f3ee 100644
--- a/tests/trainer/test_fsdp.py
+++ b/tests/trainer/test_fsdp.py
@@ -1,6 +1,7 @@
 # Copyright 2022 MosaicML Composer authors
 # SPDX-License-Identifier: Apache-2.0
 
+import contextlib
 from unittest.mock import MagicMock
 
 import pytest
@@ -219,10 +220,19 @@ def test_fsdp_process_group(world_size: int):
 @pytest.mark.gpu
 @world_size(2)
 @pytest.mark.skipif(version.parse(torch.__version__) < version.parse('2.2.0'), reason='Device mesh requires Torch 2.2')
-def test_wrong_size_device_mesh_error(world_size: int):
-    with pytest.raises(ValueError, match='.*requires a device mesh of size 1.*'):
+@pytest.mark.parametrize('sharding_strategy',
+                         ['NO_SHARD', 'SHARD_GRAD_OP', 'FULL_SHARD', 'HYBRID_SHARD', '_HYBRID_SHARD_ZERO2'])
+@pytest.mark.parametrize('device_mesh', [[2], [1, 2]])
+def test_wrong_size_device_mesh_error(world_size: int, sharding_strategy: str, device_mesh: list[int]):
+    context = contextlib.nullcontext()
+    if sharding_strategy in ['NO_SHARD', 'SHARD_GRAD_OP', 'FULL_SHARD'] and len(device_mesh) != 1:
+        context = pytest.raises(ValueError, match='.*requires a device mesh of size 1.*')
+    if sharding_strategy in ['HYBRID_SHARD', '_HYBRID_SHARD_ZERO2'] and len(device_mesh) != 2:
+        context = pytest.raises(ValueError, match='.*requires a device mesh of size 2.*')
+    with context:
         Trainer(model=SimpleModel(), fsdp_config={
-            'device_mesh': [1, 2],
+            'sharding_strategy': sharding_strategy,
+            'device_mesh': device_mesh,
         })
 

From 0814e01698ed9c244d6af30211bcfce3deeeed3d Mon Sep 17 00:00:00 
2001 From: Mihir Patel Date: Tue, 27 Feb 2024 10:06:47 -0500 Subject: [PATCH 28/28] Add torch 2.2.1 support (#3059) * 2.2 is patch free * docker * fix torchvision * run generate * remove apex --- composer/trainer/mosaic_fsdp.py | 4 ++++ docker/Dockerfile | 20 -------------------- docker/README.md | 6 +++--- docker/build_matrix.yaml | 24 ++++++++++++------------ docker/generate_build_matrix.py | 8 ++++---- tests/test_passes.py | 4 ---- 6 files changed, 23 insertions(+), 43 deletions(-) diff --git a/composer/trainer/mosaic_fsdp.py b/composer/trainer/mosaic_fsdp.py index 07a4f15fbf..51bf891491 100644 --- a/composer/trainer/mosaic_fsdp.py +++ b/composer/trainer/mosaic_fsdp.py @@ -61,6 +61,10 @@ def patch_pytorch(): from torch.distributed.fsdp import _runtime_utils _runtime_utils._validate_and_get_hybrid_shard_state = lambda *args, **kwargs: None + elif version.parse(torch.__version__) < version.parse('2.2.2'): + # Monkey patch for torch < 2.2.2 ie torch == 2.2.1 + pass + elif version.parse(torch.__version__) < version.parse('2.3.1'): # Monkey patch for torch < 2.3.1 ie torch == 2.3.0 # Note: this is the same patch as 2.2.0, we are just making a new if branch diff --git a/docker/Dockerfile b/docker/Dockerfile index e5ae9b9468..7c9735e13f 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -270,26 +270,6 @@ RUN if [ -n "$MOFED_VERSION" ] ; then \ rm -rf /tmp/mofed ; \ fi - -##################### -# Install NVIDIA Apex -##################### -# skip if torch nightly is installed as there is incompatability -RUN if [[ -n "$CUDA_VERSION" ]] && [[ -z "${PYTORCH_NIGHTLY_URL}" ]]; then \ - mkdir -p /tmp/apex && \ - cd /tmp/apex && \ - git clone https://github.com/NVIDIA/apex && \ - cd apex && \ - git checkout 82ee367f3da74b4cd62a1fb47aa9806f0f47b58b && \ - pip${PYTHON_VERSION} install --no-cache-dir -r requirements.txt && \ - pip${PYTHON_VERSION} install --no-cache-dir \ - --global-option="--cpp_ext" \ - --global-option="--cuda_ext" \ - --target /usr/local/lib/python${PYTHON_VERSION}/dist-packages \ - ./ && \ - rm -rf /tmp/apex ; \ - fi - ########################## # Install Flash Attention ########################## diff --git a/docker/README.md b/docker/README.md index d0624e2665..1491e162f2 100644 --- a/docker/README.md +++ b/docker/README.md @@ -32,9 +32,9 @@ To install composer, once inside the image, run `pip install mosaicml`. 
|----------------|----------|-------------------|---------------------|------------------|------------------------------------------------------------------------------------------| | Ubuntu 20.04 | Base | 2.3.0 | 12.1.0 (Infiniband) | 3.11 | `mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.11-ubuntu20.04` | | Ubuntu 20.04 | Base | 2.3.0 | 12.1.0 (EFA) | 3.11 | `mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.11-ubuntu20.04-aws` | -| Ubuntu 20.04 | Base | 2.2.0 | 12.1.0 (Infiniband) | 3.11 | `mosaicml/pytorch:2.2.0_cu121-python3.11-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.2.0 | 12.1.0 (EFA) | 3.11 | `mosaicml/pytorch:2.2.0_cu121-python3.11-ubuntu20.04-aws` | -| Ubuntu 20.04 | Base | 2.2.0 | cpu | 3.11 | `mosaicml/pytorch:2.2.0_cpu-python3.11-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.2.1 | 12.1.0 (Infiniband) | 3.11 | `mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.2.1 | 12.1.0 (EFA) | 3.11 | `mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04-aws` | +| Ubuntu 20.04 | Base | 2.2.1 | cpu | 3.11 | `mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04` | | Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (Infiniband) | 3.10 | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04` | | Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (EFA) | 3.10 | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04-aws` | | Ubuntu 20.04 | Base | 2.1.2 | cpu | 3.10 | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04` | diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml index 21c36347e9..fb548db8bb 100644 --- a/docker/build_matrix.yaml +++ b/docker/build_matrix.yaml @@ -2,7 +2,7 @@ - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 CUDA_VERSION: 12.1.0 - IMAGE_NAME: torch-2-2-0-cu121 + IMAGE_NAME: torch-2-2-1-cu121 MOFED_VERSION: 5.5-1.0.3.2 NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 @@ -21,15 +21,15 @@ PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.2.0 + PYTORCH_VERSION: 2.2.1 TAGS: - - mosaicml/pytorch:2.2.0_cu121-python3.11-ubuntu20.04 + - mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04 TARGET: pytorch_stage - TORCHVISION_VERSION: 0.17.0 + TORCHVISION_VERSION: 0.17.1 - AWS_OFI_NCCL_VERSION: v1.7.4-aws BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 CUDA_VERSION: 12.1.0 - IMAGE_NAME: torch-2-2-0-cu121-aws + IMAGE_NAME: torch-2-2-1-cu121-aws MOFED_VERSION: '' NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 @@ -48,25 +48,25 @@ PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.2.0 + PYTORCH_VERSION: 2.2.1 TAGS: - - mosaicml/pytorch:2.2.0_cu121-python3.11-ubuntu20.04-aws + - mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04-aws TARGET: pytorch_stage - TORCHVISION_VERSION: 0.17.0 + TORCHVISION_VERSION: 0.17.1 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: ubuntu:20.04 CUDA_VERSION: '' - IMAGE_NAME: torch-2-2-0-cpu + IMAGE_NAME: torch-2-2-1-cpu MOFED_VERSION: '' NVIDIA_REQUIRE_CUDA_OVERRIDE: '' PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.2.0 + PYTORCH_VERSION: 2.2.1 
TAGS: - - mosaicml/pytorch:2.2.0_cpu-python3.11-ubuntu20.04 + - mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04 TARGET: pytorch_stage - TORCHVISION_VERSION: 0.17.0 + TORCHVISION_VERSION: 0.17.1 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 CUDA_VERSION: 12.1.0 diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py index ca378388c6..215acef5b8 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -23,8 +23,8 @@ def _get_torchvision_version(pytorch_version: str): - if pytorch_version == '2.2.0': - return '0.17.0' + if pytorch_version == '2.2.1': + return '0.17.1' if pytorch_version == '2.1.2': return '0.16.2' if pytorch_version == '2.0.1': @@ -42,7 +42,7 @@ def _get_cuda_version(pytorch_version: str, use_cuda: bool): # From https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/ if not use_cuda: return '' - if pytorch_version == '2.2.0': + if pytorch_version == '2.2.1': return '12.1.0' if pytorch_version == '2.1.2': return '12.1.0' @@ -163,7 +163,7 @@ def _write_table(table_tag: str, table_contents: str): def _main(): - python_pytorch_versions = [('3.11', '2.2.0'), ('3.10', '2.1.2'), ('3.10', '2.0.1')] + python_pytorch_versions = [('3.11', '2.2.1'), ('3.10', '2.1.2'), ('3.10', '2.0.1')] cuda_options = [True, False] stages = ['pytorch_stage'] interconnects = ['mellanox', 'EFA'] # mellanox is default, EFA needed for AWS diff --git a/tests/test_passes.py b/tests/test_passes.py index 19e5dc0843..3f3a99dee6 100644 --- a/tests/test_passes.py +++ b/tests/test_passes.py @@ -84,10 +84,6 @@ class TestAlgorithmOrderingPasses: @pytest.mark.parametrize('algorithm_cls', [LowPrecisionLayerNorm]) def test_algorithm_last(self, algorithm_cls: Type[Algorithm], always_match_algorithms: List[Algorithm], dummy_logger: Logger, dummy_state: State): - - if algorithm_cls == LowPrecisionLayerNorm: - pytest.importorskip('apex') - algorithm = algorithm_cls() algorithm.apply = Mock(return_value='algo') algorithm.match = Mock(return_value=True)
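Closing note (editor's illustration, not part of the patch series): the mosaic_fsdp.py change in PATCH 28 follows Composer's usual version-gating pattern for monkey patches. A hedged sketch of that pattern follows; the helper in the final branch is a hypothetical placeholder, and the branches for older torch versions are omitted.

import torch
from packaging import version

def patch_pytorch_sketch():
    # Dispatch on the installed torch version, mirroring patch_pytorch above.
    if version.parse(torch.__version__) < version.parse('2.2.2'):
        # torch == 2.2.1: upstream already ships the fixes ('2.2 is patch free'),
        # so there is nothing to monkey patch.
        pass
    elif version.parse(torch.__version__) < version.parse('2.3.1'):
        # torch == 2.3.0: re-apply the 2.2.0-era patch under a new branch.
        apply_2_2_0_era_patches()  # hypothetical placeholder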