From 19368c66e9b01d32a97d81548ebbb42b107c4970 Mon Sep 17 00:00:00 2001 From: Brian <23239305+b-chu@users.noreply.github.com> Date: Fri, 19 Jan 2024 15:34:18 -0500 Subject: [PATCH 01/16] Update license (#887) Updates the license for 2024. New files will have a copyright year of 2024 inserted in the header. Existing files will not be changed. --- .ci/FILE_HEADER | 2 +- .pre-commit-config.yaml | 3 ++- setup.py | 3 +++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.ci/FILE_HEADER b/.ci/FILE_HEADER index 22198520fd..e6d99f5d6f 100644 --- a/.ci/FILE_HEADER +++ b/.ci/FILE_HEADER @@ -1,2 +1,2 @@ -Copyright 2022 MosaicML LLM Foundry authors +Copyright 2024 MosaicML LLM Foundry authors SPDX-License-Identifier: Apache-2.0 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d4c8cc699c..4a5bf1f6bb 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -57,7 +57,7 @@ repos: - id: mixed-line-ending - id: trailing-whitespace - repo: https://github.com/Lucas-C/pre-commit-hooks - rev: v1.3.1 + rev: v1.5.4 hooks: - id: insert-license args: @@ -65,6 +65,7 @@ repos: - .ci/FILE_HEADER - --comment-style - '#' + - --allow-past-years types: [python] - repo: https://github.com/PyCQA/docformatter rev: v1.5.0 diff --git a/setup.py b/setup.py index 2c4a05f396..2a2b5cc9cd 100644 --- a/setup.py +++ b/setup.py @@ -1,3 +1,6 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + """MosaicML LLM Foundry package setup.""" import os From c9a49d0b3db97263f2df2d79f51a386e3e2b69a7 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Fri, 19 Jan 2024 18:45:22 -0800 Subject: [PATCH 02/16] Fix tiktoken add generation prompt (#890) --- llmfoundry/tokenizers/tiktoken.py | 2 +- tests/tokenizers/test_tiktoken.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/llmfoundry/tokenizers/tiktoken.py b/llmfoundry/tokenizers/tiktoken.py index 2632985533..eaaf0da316 100644 --- a/llmfoundry/tokenizers/tiktoken.py +++ b/llmfoundry/tokenizers/tiktoken.py @@ -198,7 +198,7 @@ def default_chat_template(self): '{% else %}' "{{ '\n' + '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}" '{% endif %}' - '{% if (add_generation_prompt == true) %}' + '{% if (add_generation_prompt == true and loop.last) %}' "{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}" "{% elif (message['role'] == 'assistant') %}" '{{ eos_token }}' diff --git a/tests/tokenizers/test_tiktoken.py b/tests/tokenizers/test_tiktoken.py index 6a4d1c99c4..aca269af82 100644 --- a/tests/tokenizers/test_tiktoken.py +++ b/tests/tokenizers/test_tiktoken.py @@ -108,6 +108,12 @@ 'Please summarize the goals in this text:\n\nGoing outside has benefits include reducing stress and triggering the relaxation response, which can help us not only feel better mentally, but even heal faster from physical ailments.', 'role': 'user' +}, { + 'content': 'You should go outside and touch grass.', + 'role': 'assistant' +}, { + 'content': 'What else can I do?', + 'role': 'user' }]] MULTI_TURN_GENERATE_STRING = [ @@ -118,6 +124,10 @@ Going outside has benefits include reducing stress and triggering the relaxation response, which can help us not only feel better mentally, but even heal faster from physical ailments.<|im_end|> <|im_start|>assistant +You should go outside and touch grass.<|im_end|><|endoftext|> +<|im_start|>user +What else can I do?<|im_end|> +<|im_start|>assistant """ ] From b2a0c03bc2684912f3fde0dbb798e62dc70607fd Mon Sep 17 00:00:00 2001 From: Daniel King 
<43149077+dakinggg@users.noreply.github.com> Date: Sun, 21 Jan 2024 17:34:52 -0800 Subject: [PATCH 03/16] Upgrade Datasets version (#892) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 2a2b5cc9cd..8c43a309c3 100644 --- a/setup.py +++ b/setup.py @@ -55,7 +55,7 @@ 'transformers>=4.36,<4.37', 'mosaicml-streaming>=0.7.2,<0.8', 'torch>=2.1,<2.1.1', - 'datasets==2.15.0', + 'datasets>=2.16,<2.17', 'fsspec==2023.6.0', # newer version results in a bug in datasets that duplicates data 'sentencepiece==0.1.97', 'einops==0.7.0', From 02c44ad6837a5aa61f9268f322b8ca19b8917dde Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Mon, 22 Jan 2024 17:43:58 -0800 Subject: [PATCH 04/16] Bump transformers version to support Mixtral (#894) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8c43a309c3..511e665ed4 100644 --- a/setup.py +++ b/setup.py @@ -52,7 +52,7 @@ install_requires = [ 'mosaicml[libcloud,wandb,mlflow,oci,gcs]>=0.17.2,<0.18', 'accelerate>=0.25,<0.26', # for HF inference `device_map` - 'transformers>=4.36,<4.37', + 'transformers>=4.37,<4.38', 'mosaicml-streaming>=0.7.2,<0.8', 'torch>=2.1,<2.1.1', 'datasets>=2.16,<2.17', From 07d6db36b5399cec77dba4d9aa31a153105b348b Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Mon, 22 Jan 2024 19:16:08 -0800 Subject: [PATCH 05/16] Add `tokenizer-only` flag to only download tokenizers from HF or oras (#895) --- llmfoundry/utils/model_download_utils.py | 26 ++++++++++++++++++------ scripts/misc/download_model.py | 20 ++++++++++++++---- tests/utils/test_model_download_utils.py | 1 + 3 files changed, 37 insertions(+), 10 deletions(-) diff --git a/llmfoundry/utils/model_download_utils.py b/llmfoundry/utils/model_download_utils.py index 5d8a413d91..07c84a85c8 100644 --- a/llmfoundry/utils/model_download_utils.py +++ b/llmfoundry/utils/model_download_utils.py @@ -30,6 +30,12 @@ ] PYTORCH_WEIGHTS_PATTERN = 'pytorch_model*.bin*' SAFE_WEIGHTS_PATTERN = 'model*.safetensors*' +TOKENIZER_FILES = [ + 'special_tokens_map.json', + 'tokenizer.json', + 'tokenizer.model', + 'tokenizer_config.json', +] ORAS_PASSWD_PLACEHOLDER = '' ORAS_CLI = 'oras' @@ -45,6 +51,7 @@ def download_from_hf_hub( model: str, save_dir: str, prefer_safetensors: bool = True, + tokenizer_only: bool = False, token: Optional[str] = None, ): """Downloads model files from a Hugging Face Hub model repo. @@ -57,6 +64,7 @@ def download_from_hf_hub( save_dir (str, optional): The local path to the directory where the model files will be downloaded. prefer_safetensors (bool): Whether to prefer Safetensors weights over PyTorch weights if both are available. Defaults to True. + tokenizer_only (bool): If true, only download tokenizer files. token (str, optional): The HuggingFace API token. If not provided, the token will be read from the `HUGGING_FACE_HUB_TOKEN` environment variable. @@ -95,10 +103,13 @@ def download_from_hf_hub( ' Please make sure the repo contains either safetensors or pytorch weights.' 
) + allow_patterns = TOKENIZER_FILES if tokenizer_only else None + download_start = time.time() hf_hub.snapshot_download(model, local_dir=save_dir, ignore_patterns=ignore_patterns, + allow_patterns=allow_patterns, token=token) download_duration = time.time() - download_start log.info( @@ -221,16 +232,18 @@ def download_from_oras(model: str, config_file: str, credentials_dir: str, save_dir: str, + tokenizer_only: bool = False, concurrency: int = 10): """Download from an OCI-compliant registry using oras. Args: - model: The name of the model to download. - config_file: Path to a YAML config file that maps model names to registry paths. - credentials_dir: Path to a directory containing credentials for the registry. It is expected to contain three + model (str): The name of the model to download. + config_file (str): Path to a YAML config file that maps model and tokenizer names to registry paths. + credentials_dir (str): Path to a directory containing credentials for the registry. It is expected to contain three files: `username`, `password`, and `registry`, each of which contains the corresponding credential. - save_dir: Path to the directory where files will be downloaded. - concurrency: The number of concurrent downloads to run. + save_dir (str): Path to the directory where files will be downloaded. + tokenizer_only (bool): If true, only download the tokenzier files. + concurrency (int): The number of concurrent downloads to run. """ if shutil.which(ORAS_CLI) is None: raise Exception( @@ -253,7 +266,8 @@ def _read_secrets_file(secret_file_path: str,): with open(config_file, 'r', encoding='utf-8') as f: configs = yaml.safe_load(f.read()) - path = configs['models'][model] + config_type = 'tokenizers' if tokenizer_only else 'models' + path = configs[config_type][model] registry = secrets['registry'] def get_oras_cmd(username: Optional[str] = None, diff --git a/scripts/misc/download_model.py b/scripts/misc/download_model.py index 1913267e20..13a63ce55e 100644 --- a/scripts/misc/download_model.py +++ b/scripts/misc/download_model.py @@ -7,10 +7,11 @@ python download_model.py hf --model mosaicml/mpt-7b --save-dir --token Download from ORAS registry: - python download_model.py oras --registry --path mosaicml/mpt-7b --save-dir + python download_model.py oras --model mosaicml/mpt-7b --config-file \ + --credentials-dir --save-dir Download from an HTTP file server: - python download_model.py http --host https://server.com --path mosaicml/mpt-7b --save-dir + python download_model.py http --url https://server.com/models/mosaicml/mpt-7b/ --save-dir Download from an HTTP file server with fallback to Hugging Face Hub: python download_model.py http --host https://server.com --path mosaicml/mpt-7b --save-dir \ @@ -56,6 +57,9 @@ def parse_args() -> argparse.Namespace: base_parser = argparse.ArgumentParser(add_help=False) base_parser.add_argument('--save-dir', type=str, required=True) + base_parser.add_argument('--tokenizer-only', + default=False, + action='store_true') # Add subparser for downloading from Hugging Face Hub. 
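    # A usage sketch, not part of the committed diff: with --tokenizer-only
    # registered on the shared base parser above, fetching just the tokenizer
    # files from the Hugging Face Hub might look like
    #
    #   python download_model.py hf --model mosaicml/mpt-7b --save-dir /tmp/mpt-7b --tokenizer-only
    #
    # The flag name and behavior follow the argparse and download_from_hf_hub
    # changes in this patch; the save directory above is a placeholder.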
hf_parser = subparsers.add_parser('hf', parents=[base_parser]) @@ -85,6 +89,9 @@ def parse_args() -> argparse.Namespace: download_from = args.download_from if download_from == 'http': + if args.tokenizer_only: + raise ValueError( + 'tokenizer-only is not currently supported for http.') try: download_from_http_fileserver(args.url, args.save_dir, args.ignore_cert) @@ -109,7 +116,12 @@ def parse_args() -> argparse.Namespace: download_from_hf_hub(args.model, save_dir=args.save_dir, token=args.token, + tokenizer_only=args.tokenizer_only, prefer_safetensors=args.prefer_safetensors) elif download_from == 'oras': - download_from_oras(args.model, args.config_file, args.credentials_dir, - args.save_dir, args.concurrency) + download_from_oras(args.model, + args.config_file, + args.credentials_dir, + args.save_dir, + tokenizer_only=args.tokenizer_only, + concurrency=args.concurrency) diff --git a/tests/utils/test_model_download_utils.py b/tests/utils/test_model_download_utils.py index 471a39dcdb..14749bdcd9 100644 --- a/tests/utils/test_model_download_utils.py +++ b/tests/utils/test_model_download_utils.py @@ -110,6 +110,7 @@ def test_download_from_hf_hub_weights_pref(mock_list_repo_files: MagicMock, mock_snapshot_download.assert_called_once_with( test_repo_id, local_dir=save_dir, + allow_patterns=None, ignore_patterns=expected_ignore_patterns, token=None) From f2614a4a609e1575073c02422077ae4e89b82549 Mon Sep 17 00:00:00 2001 From: Anna Date: Mon, 22 Jan 2024 21:29:23 -0800 Subject: [PATCH 06/16] Foundational Model API eval wrapper (#849) * FMAPI model wrapper * add chat wrapper too * revert * end line * formatting * less verbose * better error messages --- .../models/inference_api_wrapper/__init__.py | 4 ++ .../models/inference_api_wrapper/fmapi.py | 72 +++++++++++++++++++ .../inference_api_wrapper/openai_causal_lm.py | 27 +++++-- llmfoundry/models/model_registry.py | 8 ++- 4 files changed, 104 insertions(+), 7 deletions(-) create mode 100644 llmfoundry/models/inference_api_wrapper/fmapi.py diff --git a/llmfoundry/models/inference_api_wrapper/__init__.py b/llmfoundry/models/inference_api_wrapper/__init__.py index 496abf2aa6..9bb2ece2b2 100644 --- a/llmfoundry/models/inference_api_wrapper/__init__.py +++ b/llmfoundry/models/inference_api_wrapper/__init__.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 +from llmfoundry.models.inference_api_wrapper.fmapi import ( + FMAPICasualLMEvalWrapper, FMAPIChatAPIEvalWrapper) from llmfoundry.models.inference_api_wrapper.interface import \ InferenceAPIEvalWrapper from llmfoundry.models.inference_api_wrapper.openai_causal_lm import ( @@ -10,4 +12,6 @@ 'OpenAICausalLMEvalWrapper', 'OpenAIChatAPIEvalWrapper', 'InferenceAPIEvalWrapper', + 'FMAPICasualLMEvalWrapper', + 'FMAPIChatAPIEvalWrapper', ] diff --git a/llmfoundry/models/inference_api_wrapper/fmapi.py b/llmfoundry/models/inference_api_wrapper/fmapi.py new file mode 100644 index 0000000000..867b3c272e --- /dev/null +++ b/llmfoundry/models/inference_api_wrapper/fmapi.py @@ -0,0 +1,72 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +import logging +import os +import time +from typing import Dict + +import requests +from transformers import AutoTokenizer + +from llmfoundry.models.inference_api_wrapper.openai_causal_lm import ( + OpenAICausalLMEvalWrapper, OpenAIChatAPIEvalWrapper, OpenAIEvalInterface) + +__all__ = [ + 'FMAPICasualLMEvalWrapper', + 'FMAPIChatAPIEvalWrapper', +] + +log = logging.getLogger(__name__) + + 
+def block_until_ready(base_url: str): + """Block until the endpoint is ready.""" + sleep_s = 5 + timout_s = 5 * 60 # At max, wait 5 minutes + + ping_url = f'{base_url}/ping' + + waited_s = 0 + while True: + try: + requests.get(ping_url) + log.info(f'Endpoint {ping_url} is ready') + break + except requests.exceptions.ConnectionError: + log.debug( + f'Endpoint {ping_url} not ready yet. Sleeping {sleep_s} seconds' + ) + time.sleep(sleep_s) + waited_s += sleep_s + + if waited_s >= timout_s: + raise TimeoutError( + f'Endpoint {ping_url} did not become read after {waited_s:,} seconds, exiting' + ) + + +class FMAPIEvalInterface(OpenAIEvalInterface): + + def __init__(self, model_cfg: Dict, tokenizer: AutoTokenizer): + is_local = model_cfg.pop('local', False) + if is_local: + base_url = os.environ.get('MOSAICML_MODEL_ENDPOINT', + 'http://0.0.0.0:8080/v2') + model_cfg['base_url'] = base_url + block_until_ready(base_url) + + if 'base_url' not in model_cfg: + raise ValueError( + 'Must specify base_url or use local=True in model_cfg for FMAPIsEvalWrapper' + ) + + super().__init__(model_cfg, tokenizer) + + +class FMAPICasualLMEvalWrapper(FMAPIEvalInterface, OpenAICausalLMEvalWrapper): + """Databricks Foundational Model API wrapper for causal LM models.""" + + +class FMAPIChatAPIEvalWrapper(FMAPIEvalInterface, OpenAIChatAPIEvalWrapper): + """Databricks Foundational Model API wrapper for chat models.""" diff --git a/llmfoundry/models/inference_api_wrapper/openai_causal_lm.py b/llmfoundry/models/inference_api_wrapper/openai_causal_lm.py index 39de2ba59c..587dd179bd 100644 --- a/llmfoundry/models/inference_api_wrapper/openai_causal_lm.py +++ b/llmfoundry/models/inference_api_wrapper/openai_causal_lm.py @@ -36,9 +36,6 @@ class OpenAIEvalInterface(InferenceAPIEvalWrapper): def __init__(self, model_cfg: Dict, tokenizer: AutoTokenizer) -> None: super().__init__(model_cfg, tokenizer) - assert os.getenv( - 'OPENAI_API_KEY' - ) is not None, 'No OpenAI API Key found. Ensure it is saved as an environmental variable called OPENAI_API_KEY.' try: import openai except ImportError as e: @@ -46,8 +43,28 @@ def __init__(self, model_cfg: Dict, tokenizer: AutoTokenizer) -> None: extra_deps_group='openai', conda_package='openai', conda_channel='conda-forge') from e - self.client = openai.OpenAI() - self.model_name = model_cfg['version'] + + api_key = os.environ.get('OPENAI_API_KEY') + base_url = model_cfg.get('base_url') + if base_url is None: + # Using OpenAI default, where the API key is required + if api_key is None: + raise ValueError( + 'No OpenAI API Key found. Ensure it is saved as an environmental variable called OPENAI_API_KEY.' 
+ ) + + else: + # Using a custom base URL, where the API key may not be required + log.info( + f'Making request to custom base URL: {base_url}{"" if api_key is not None else " (no API key set)"}' + ) + api_key = 'placeholder' # This cannot be None + + self.client = openai.OpenAI(base_url=base_url, api_key=api_key) + if 'version' in model_cfg: + self.model_name = model_cfg['version'] + else: + self.model_name = model_cfg['name'] def generate_completion(self, prompt: str, num_tokens: int): raise NotImplementedError() diff --git a/llmfoundry/models/model_registry.py b/llmfoundry/models/model_registry.py index be09a69835..ff9942f5f6 100644 --- a/llmfoundry/models/model_registry.py +++ b/llmfoundry/models/model_registry.py @@ -3,7 +3,9 @@ from llmfoundry.models.hf import (ComposerHFCausalLM, ComposerHFPrefixLM, ComposerHFT5) -from llmfoundry.models.inference_api_wrapper import (OpenAICausalLMEvalWrapper, +from llmfoundry.models.inference_api_wrapper import (FMAPICasualLMEvalWrapper, + FMAPIChatAPIEvalWrapper, + OpenAICausalLMEvalWrapper, OpenAIChatAPIEvalWrapper) from llmfoundry.models.mpt import ComposerMPTCausalLM @@ -13,5 +15,7 @@ 'hf_prefix_lm': ComposerHFPrefixLM, 'hf_t5': ComposerHFT5, 'openai_causal_lm': OpenAICausalLMEvalWrapper, - 'openai_chat': OpenAIChatAPIEvalWrapper + 'fmapi_causal_lm': FMAPICasualLMEvalWrapper, + 'openai_chat': OpenAIChatAPIEvalWrapper, + 'fmapi_chat': FMAPIChatAPIEvalWrapper, } From 36fcb5e59db73eb09bfcc93087ef12fb6ecc8975 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Tue, 23 Jan 2024 11:25:49 -0800 Subject: [PATCH 07/16] Add better error for non-empty local output folder in convert_text_to_mds.py (#891) --- scripts/data_prep/convert_text_to_mds.py | 4 + .../data_prep/test_convert_text_to_mds.py | 82 ++++++++++--------- 2 files changed, 47 insertions(+), 39 deletions(-) diff --git a/scripts/data_prep/convert_text_to_mds.py b/scripts/data_prep/convert_text_to_mds.py index 2218e575b2..bfd60b8ee1 100644 --- a/scripts/data_prep/convert_text_to_mds.py +++ b/scripts/data_prep/convert_text_to_mds.py @@ -385,6 +385,10 @@ def convert_text_to_mds( local_output_folder = tempfile.TemporaryDirectory( ).name if is_remote_output else output_folder + if os.path.isdir(output_folder) and len(os.listdir(output_folder)) > 0: + raise FileExistsError( + f'{output_folder=} is not empty. 
Please remove or empty it.') + if processes > 1: # Download and convert the text files in parallel args = get_task_args(object_names, local_output_folder, input_folder, diff --git a/tests/a_scripts/data_prep/test_convert_text_to_mds.py b/tests/a_scripts/data_prep/test_convert_text_to_mds.py index cc293a2cdd..3a00a8889f 100644 --- a/tests/a_scripts/data_prep/test_convert_text_to_mds.py +++ b/tests/a_scripts/data_prep/test_convert_text_to_mds.py @@ -3,6 +3,7 @@ import os import pathlib +import shutil from concurrent.futures import ProcessPoolExecutor from glob import glob from typing import Callable, Iterable, List @@ -55,23 +56,6 @@ def upload_object(self, object_name: str, filename: str): remote_file.write(local_file.read()) -def _call_convert_text_to_mds(processes: int, tokenizer_name: str, - concat_tokens: int) -> None: - convert_text_to_mds( - tokenizer_name=tokenizer_name, - output_folder=f's3://fake-test-output-path', - input_folder=f's3://fake-test-input-path', - concat_tokens=concat_tokens, - eos_text='', - bos_text='', - no_wrap=False, - compression='zstd', - processes=processes, - args_str='Namespace()', - reprocess=False, - ) - - # Mock starmap with no multiprocessing def _mock_map(func: Callable, args: Iterable) -> Iterable: for arg in args: @@ -107,9 +91,22 @@ def test_single_and_multi_process(merge_shard_groups: Mock, maybe_create_object_store_from_uri.return_value = mock_object_store parse_uri.return_value = ('s3', 'fake-test-bucket', str(remote_folder)) - _call_convert_text_to_mds(processes=processes, - tokenizer_name=tokenizer_name, - concat_tokens=concat_tokens) + def call_convert_text_to_mds() -> None: + convert_text_to_mds( + tokenizer_name=tokenizer_name, + output_folder=f's3://fake-test-output-path', + input_folder=f's3://fake-test-input-path', + concat_tokens=concat_tokens, + eos_text='', + bos_text='', + no_wrap=False, + compression='zstd', + processes=processes, + args_str='Namespace()', + reprocess=False, + ) + + call_convert_text_to_mds() # Check call counts assert download_and_convert.call_count == processes # called once per process @@ -131,9 +128,7 @@ def test_single_and_multi_process(merge_shard_groups: Mock, _assert_files_exist(prefix=remote_folder, files=['index.json', DONE_FILENAME] + shards) - _call_convert_text_to_mds(processes=processes, - tokenizer_name=tokenizer_name, - concat_tokens=concat_tokens) + call_convert_text_to_mds() # Check call counts assert download_and_convert.call_count == processes # No changes because we shoudn't reprocess @@ -146,9 +141,7 @@ def test_single_and_multi_process(merge_shard_groups: Mock, mock_object_store = Mock(wraps=object_store) maybe_create_object_store_from_uri.return_value = mock_object_store - _call_convert_text_to_mds(processes=processes, - tokenizer_name=tokenizer_name, - concat_tokens=concat_tokens) + call_convert_text_to_mds() # Check call counts assert download_and_convert.call_count == processes * 2 # called once per process @@ -187,31 +180,42 @@ def test_local_path(tmp_path: pathlib.Path): input_folder = tmp_path / 'input' output_folder = tmp_path / 'output' + def call_convert_text_to_mds(reprocess: bool): + convert_text_to_mds( + tokenizer_name='mosaicml/mpt-7b', + output_folder=str(output_folder), + input_folder=str(input_folder), + concat_tokens=1, + eos_text='', + bos_text='', + no_wrap=False, + compression='zstd', + processes=1, + args_str='Namespace()', + reprocess=reprocess, + ) + # Create input text data os.makedirs(input_folder, exist_ok=True) with open(input_folder / 'test.txt', 'w') as f: 
f.write('test') # Convert text data to mds - convert_text_to_mds( - tokenizer_name='mosaicml/mpt-7b', - output_folder=str(output_folder), - input_folder=str(input_folder), - concat_tokens=1, - eos_text='', - bos_text='', - no_wrap=False, - compression='zstd', - processes=1, - args_str='Namespace()', - reprocess=False, - ) + call_convert_text_to_mds(reprocess=False) # Make sure all the files exist as expected. assert os.path.exists(output_folder / '.text_to_mds_conversion_done') assert os.path.exists(output_folder / 'index.json') assert os.path.exists(output_folder / 'shard.00000.mds.zstd') + # Test reprocessing. + with pytest.raises(FileExistsError): + call_convert_text_to_mds(reprocess=True) + + shutil.rmtree(output_folder) + + call_convert_text_to_mds(reprocess=True) + def test_is_already_processed(tmp_path: pathlib.Path): tmp_path_str = str(tmp_path) From 4961436ff199837cdee223a1d47e7d0d258fb4fd Mon Sep 17 00:00:00 2001 From: Nicholas Garcia Date: Tue, 23 Jan 2024 14:14:46 -0800 Subject: [PATCH 08/16] Allow bool input for loggers (#897) * Allow bool input for loggers * Convert earlier on * Fix test case --- llmfoundry/utils/builders.py | 15 +++++---------- scripts/train/train.py | 3 ++- tests/utils/test_builders.py | 4 ++-- 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 75438b895e..29642381f8 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -219,21 +219,16 @@ def build_callback( def build_logger(name: str, kwargs: Dict[str, Any]) -> LoggerDestination: - kwargs_dict = { - k: v if isinstance(v, str) else om.to_container(v, resolve=True) - for k, v in kwargs.items() - } - if name == 'wandb': - return WandBLogger(**kwargs_dict) + return WandBLogger(**kwargs) elif name == 'tensorboard': - return TensorboardLogger(**kwargs_dict) + return TensorboardLogger(**kwargs) elif name == 'in_memory_logger': - return InMemoryLogger(**kwargs_dict) + return InMemoryLogger(**kwargs) elif name == 'mlflow': - return MLFlowLogger(**kwargs_dict) + return MLFlowLogger(**kwargs) elif name == 'inmemory': - return InMemoryLogger(**kwargs_dict) + return InMemoryLogger(**kwargs) else: raise ValueError(f'Not sure how to build logger: {name}') diff --git a/scripts/train/train.py b/scripts/train/train.py index c3da1f1d3c..638ad8aaea 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -278,7 +278,8 @@ def main(cfg: DictConfig) -> Trainer: logger_configs: Optional[DictConfig] = pop_config(cfg, 'loggers', must_exist=False, - default_value=None) + default_value=None, + convert=True) callback_configs: Optional[DictConfig] = pop_config(cfg, 'callbacks', must_exist=False, diff --git a/tests/utils/test_builders.py b/tests/utils/test_builders.py index 9be6630075..303afc9b7d 100644 --- a/tests/utils/test_builders.py +++ b/tests/utils/test_builders.py @@ -135,14 +135,14 @@ def test_build_logger(): with pytest.raises(ValueError): _ = build_logger('unknown', {}) - logger_cfg = DictConfig({ + logger_cfg = { 'project': 'foobar', 'init_kwargs': { 'config': { 'foo': 'bar', } } - }) + } wandb_logger = build_logger('wandb', logger_cfg) # type: ignore assert isinstance(wandb_logger, WandBLogger) assert wandb_logger.project == 'foobar' From ea0521ca85422d40429ac5f9ab8dcaca9cd3e907 Mon Sep 17 00:00:00 2001 From: Vitaliy Chiley <6439018+vchiley@users.noreply.github.com> Date: Tue, 23 Jan 2024 15:50:57 -0800 Subject: [PATCH 09/16] Enable QK Group Norm (#869) * start qkgn * attn defaults for qk_gn * impl qk_gn * 
Update attention.py * Update attention.py * Update test_flash_triton_torch.py * Update attention.py * Update test_flash_triton_torch.py * Update attention.py * lint * Update attention.py * lint * add avlue error * Update attention.py * updt to include low precision groupnorm; * perf improvement * Revert "perf improvement" This reverts commit 2b62d5ecd21e13cb1bcd0883b3f6ebd1229e9d1d. * Revert "updt to include low precision groupnorm;" This reverts commit bca1c3383f5d2ea3009d4ee297ccc26db146cf20. --- llmfoundry/models/layers/attention.py | 29 ++++++++++++++----- llmfoundry/models/layers/blocks.py | 1 + llmfoundry/models/mpt/configuration_mpt.py | 1 + .../models/layers/test_flash_triton_torch.py | 15 +++++++--- 4 files changed, 35 insertions(+), 11 deletions(-) diff --git a/llmfoundry/models/layers/attention.py b/llmfoundry/models/layers/attention.py index fecd79553f..89f861c3f0 100644 --- a/llmfoundry/models/layers/attention.py +++ b/llmfoundry/models/layers/attention.py @@ -516,6 +516,7 @@ def __init__( attn_impl: str = 'triton', clip_qkv: Optional[float] = None, qk_ln: bool = False, + qk_gn: bool = False, softmax_scale: Optional[float] = None, attn_pdrop: float = 0.0, norm_type: str = 'low_precision_layernorm', @@ -529,6 +530,7 @@ def __init__( self.attn_impl = attn_impl self.clip_qkv = clip_qkv self.qk_ln = qk_ln + self.qk_gn = qk_gn self.d_model = d_model self.n_heads = n_heads @@ -549,6 +551,8 @@ def __init__( raise ValueError( 'Each Q head should get the same number of KV heads, so n_heads must be divisible by kv_n_heads.' ) + if qk_ln and qk_gn: + raise ValueError('Only one of qk_ln and qk_gn can be set to True.') self.softmax_scale = softmax_scale if self.softmax_scale is None: @@ -572,11 +576,13 @@ def __init__( ] self.Wqkv._fused = (0, fuse_splits) - if self.qk_ln: + if self.qk_ln or self.qk_gn: norm_class = NORM_CLASS_REGISTRY[norm_type.lower()] - self.q_ln = norm_class(self.d_model, device=device) - self.k_ln = norm_class(self.kv_n_heads * self.head_dim, - device=device) + norm_size = self.head_dim if qk_gn else d_model + self.q_ln = norm_class(norm_size, device=device) + if qk_ln: + norm_size = self.head_dim * kv_n_heads + self.k_ln = norm_class(norm_size, device=device) if self.attn_impl == 'flash': self.attn_fn = flash_attn_fn @@ -623,11 +629,16 @@ def forward( key_padding_mask = attention_mask - if self.qk_ln: + if self.qk_ln or self.qk_gn: # Applying layernorm to qk + q_shape, k_shape = query.shape, key.shape + if self.qk_gn: + b, s = query.shape[:2] + query = query.view(b, s, self.n_heads, -1) + key = key.view(b, s, self.kv_n_heads, -1) dtype = query.dtype - query = self.q_ln(query).to(dtype) - key = self.k_ln(key).to(dtype) + query = self.q_ln(query).to(dtype).view(q_shape) + key = self.k_ln(key).to(dtype).view(k_shape) if rotary_emb_w_meta_info is not None: rotary_emb = rotary_emb_w_meta_info['rotary_emb'] @@ -712,6 +723,7 @@ def __init__( attn_impl: str = 'triton', clip_qkv: Optional[float] = None, qk_ln: bool = False, + qk_gn: bool = False, softmax_scale: Optional[float] = None, attn_pdrop: float = 0.0, norm_type: str = 'low_precision_layernorm', @@ -727,6 +739,7 @@ def __init__( attn_impl=attn_impl, clip_qkv=clip_qkv, qk_ln=qk_ln, + qk_gn=qk_gn, softmax_scale=softmax_scale, attn_pdrop=attn_pdrop, norm_type=norm_type, @@ -751,6 +764,7 @@ def __init__( attn_impl: str = 'triton', clip_qkv: Optional[float] = None, qk_ln: bool = False, + qk_gn: bool = False, softmax_scale: Optional[float] = None, attn_pdrop: float = 0.0, norm_type: str = 'low_precision_layernorm', @@ -766,6 
+780,7 @@ def __init__( attn_impl=attn_impl, clip_qkv=clip_qkv, qk_ln=qk_ln, + qk_gn=qk_gn, softmax_scale=softmax_scale, attn_pdrop=attn_pdrop, norm_type=norm_type, diff --git a/llmfoundry/models/layers/blocks.py b/llmfoundry/models/layers/blocks.py index 036a4e7cd2..4ac43a8bac 100644 --- a/llmfoundry/models/layers/blocks.py +++ b/llmfoundry/models/layers/blocks.py @@ -22,6 +22,7 @@ 'attn_pdrop': 0.0, 'attn_impl': 'triton', 'qk_ln': False, + 'qk_gn': False, 'clip_qkv': None, 'softmax_scale': None, 'prefix_lm': False, diff --git a/llmfoundry/models/mpt/configuration_mpt.py b/llmfoundry/models/mpt/configuration_mpt.py index 5474529277..7911728397 100644 --- a/llmfoundry/models/mpt/configuration_mpt.py +++ b/llmfoundry/models/mpt/configuration_mpt.py @@ -82,6 +82,7 @@ def __init__( attn_pdrop (float): The dropout probability for the attention layers. attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'. qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer. + qk_gn (bool): Whether to apply group normalization to the queries and keys in the attention layer. clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to this value. softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None, diff --git a/tests/models/layers/test_flash_triton_torch.py b/tests/models/layers/test_flash_triton_torch.py index 2f992cd92f..d409486cc6 100644 --- a/tests/models/layers/test_flash_triton_torch.py +++ b/tests/models/layers/test_flash_triton_torch.py @@ -28,7 +28,11 @@ def allclose_helper(t0: torch.Tensor, ('triton', 'torch'), ]) @pytest.mark.parametrize('clip_qkv', [True, False]) -@pytest.mark.parametrize('qk_ln', [True, False]) +@pytest.mark.parametrize('qk_ln, qk_gn', [ + (True, False), + (False, True), + (False, False), +]) @pytest.mark.parametrize('pos_emb_config', [{ 'alibi': False, 'rope': False @@ -64,6 +68,7 @@ def test_attn_impl(attn_impl_0: str, attn_impl_1: str, clip_qkv: bool, qk_ln: bool, + qk_gn: bool, pos_emb_config: dict, attn_type: str, attn_uses_sequence_id: bool, @@ -71,8 +76,8 @@ def test_attn_impl(attn_impl_0: str, device: str = 'cuda'): """Compare all attn impl with each other. - Includes testing with and without attn_clip_qkv, attn_qk_ln, alibi, and - rope. + Includes testing with and without attn_clip_qkv, attn_qk_ln, attn_qk_gn, + alibi, and rope. 
""" alibi = pos_emb_config['alibi'] rope = pos_emb_config['rope'] @@ -100,6 +105,7 @@ def test_attn_impl(attn_impl_0: str, 'attn_pdrop': 0, 'clip_qkv': clip_qkv, 'qk_ln': qk_ln, + 'qk_gn': qk_gn, }) n, s, f = 2, 4, cfg.d_model @@ -269,7 +275,8 @@ def gen_bias(attn_impl: str): 'rope_impl'] == 'hf' # special case that (likely) fails due to numerics - if clip_qkv and qk_ln and using_hf_rope and attn_type == 'grouped_query_attention': + if (clip_qkv and (qk_ln or qk_gn) and using_hf_rope and + attn_type == 'grouped_query_attention'): assert allclose_helper(p.grad, tp.grad, atol=2.e-2, rtol=2.e-2) else: assert allclose_helper(p.grad, tp.grad) From 1469dcb62b19a8b928e4fb491f1945bd54b60f25 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Wed, 24 Jan 2024 11:08:44 -0800 Subject: [PATCH 10/16] patch (#905) --- .github/workflows/docker.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml index bb538dbe9b..bbc5e422d3 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/docker.yaml @@ -7,7 +7,7 @@ on: branches: - main paths: - - ./Dockerfile + - Dockerfile - .github/workflows/docker.yaml workflow_dispatch: {} jobs: From 69bf90edd594995f7a9835ffb155929dc6beb9c4 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Wed, 24 Jan 2024 15:15:11 -0800 Subject: [PATCH 11/16] Add new GC option (#907) --- llmfoundry/callbacks/scheduled_gc_callback.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/llmfoundry/callbacks/scheduled_gc_callback.py b/llmfoundry/callbacks/scheduled_gc_callback.py index 6bd085e68f..216fa2adb4 100644 --- a/llmfoundry/callbacks/scheduled_gc_callback.py +++ b/llmfoundry/callbacks/scheduled_gc_callback.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import gc +from typing import Optional import torch from composer.core import Callback, State @@ -19,16 +20,19 @@ class ScheduledGarbageCollector(Callback): """Disable automatic garbage collection and collect garbage at interval. 
Args: - batch_interval (int): Number of batches between checkpoints call to gc.collect() + batch_interval (int): Number of batches between calls to gc.collect() + gen_1_batch_interval(int, optional): Number of batches between calls to gc.collect(1) eval_keep_disabled (bool): keep gc disabled during eval (default: False) """ def __init__( self, batch_interval: int, + gen_1_batch_interval: Optional[int] = None, eval_keep_disabled: bool = False, ): self.batch_interval = batch_interval + self.gen_1_batch_interval = gen_1_batch_interval self.eval_keep_disabled = eval_keep_disabled self.gc_init_state = None @@ -56,6 +60,9 @@ def fit_end(self, state: State, logger: Logger) -> None: def before_dataloader(self, state: State, logger: Logger) -> None: del logger # unused + if self.gen_1_batch_interval is not None and state.timestamp.batch.value % self.gen_1_batch_interval == 0: + gc.collect(1) + if state.timestamp.batch.value % self.batch_interval == 0: gc_cuda() From 2634987909ddc8923b762170cb9aa2870e31d5cc Mon Sep 17 00:00:00 2001 From: Jerry Chen Date: Wed, 24 Jan 2024 17:32:44 -0800 Subject: [PATCH 12/16] No symlinks at all for HF download (#908) --- llmfoundry/utils/model_download_utils.py | 1 + tests/utils/test_model_download_utils.py | 1 + 2 files changed, 2 insertions(+) diff --git a/llmfoundry/utils/model_download_utils.py b/llmfoundry/utils/model_download_utils.py index 07c84a85c8..bade3e1d7d 100644 --- a/llmfoundry/utils/model_download_utils.py +++ b/llmfoundry/utils/model_download_utils.py @@ -108,6 +108,7 @@ def download_from_hf_hub( download_start = time.time() hf_hub.snapshot_download(model, local_dir=save_dir, + local_dir_use_symlinks=False, ignore_patterns=ignore_patterns, allow_patterns=allow_patterns, token=token) diff --git a/tests/utils/test_model_download_utils.py b/tests/utils/test_model_download_utils.py index 14749bdcd9..08e11a3b0e 100644 --- a/tests/utils/test_model_download_utils.py +++ b/tests/utils/test_model_download_utils.py @@ -110,6 +110,7 @@ def test_download_from_hf_hub_weights_pref(mock_list_repo_files: MagicMock, mock_snapshot_download.assert_called_once_with( test_repo_id, local_dir=save_dir, + local_dir_use_symlinks=False, allow_patterns=None, ignore_patterns=expected_ignore_patterns, token=None) From ac7835415cf6fa91664e513c7a471f2802cdc8a2 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Fri, 26 Jan 2024 09:47:19 -0500 Subject: [PATCH 13/16] Adds support for chat formatted finetuning input data. (#884) * fix conflicting formatting linting guidelines * used older union operator for legacy support * did the same thing in another place * isort ignore specific lines * fixes * isort do not skip line * address comments * renamed some more things * split tests and add some verification for tokenization split * fix formatting * added docstrings * added end-to-end-test with HF dataset * fix code style * renamed file and fixed tests * use chat template diff * addressed comment * Update llmfoundry/data/finetuning/tasks.py Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> * Update llmfoundry/data/finetuning/tasks.py Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> * fixed type of TokenizedExample * use cast * use _ALLOWED_{PROMPT, RESPONSE}_KEYS * updated tests * fix * fix? 
* Update llmfoundry/data/finetuning/tasks.py Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> * Update llmfoundry/data/finetuning/tasks.py Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --------- Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- llmfoundry/data/finetuning/tasks.py | 128 +++++++++++++++++- tests/data/test_dataloader.py | 66 +++------ tests/data/test_template_tokenization.py | 163 +++++++++++++++++++++++ 3 files changed, 307 insertions(+), 50 deletions(-) create mode 100644 tests/data/test_template_tokenization.py diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index e61d138c41..4846e35840 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -36,7 +36,8 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]: import os import warnings from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Union +from typing import (Any, Callable, Dict, List, Literal, Optional, Tuple, Union, + cast) import datasets as hf_datasets import huggingface_hub as hf_hub @@ -57,6 +58,35 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]: '.downloaded_finetuning')) SUPPORTED_EXTENSIONS = ['.csv', '.jsonl', '.parquet'] +PromptResponseDict = Dict[str, str] +ChatFormattedDict = Dict[str, List[Dict[str, str]]] +Example = Union[PromptResponseDict, ChatFormattedDict] +ExampleType = Literal['prompt_response', 'chat'] +TokenizedExample = Dict[str, List[int]] + + +def _get_example_type(example: Example) -> ExampleType: + """Determines the type of the input example. + + Args: + example (Example): The input example, which can be a multi-way chat formatted conversation or an instruction-response pair. + + Returns: + ExampleType: The type of the input example, which can be either 'chat' for multi-way chat formatted conversation or 'prompt_response' for instruction-response pair. + + Raises: + KeyError: If the example type is unknown. + """ + if 'messages' in example: + return 'chat' + elif any([ + pr in example + for pr in _ALLOWED_PROMPT_KEYS.union(_ALLOWED_RESPONSE_KEYS) + ]): + return 'prompt_response' + else: + raise KeyError(f'Unknown conversation type {example=}') + def _is_empty_or_nonexistent(dirpath: str) -> bool: """Check if a directory is empty or non-existent. @@ -70,9 +100,70 @@ def _is_empty_or_nonexistent(dirpath: str) -> bool: return not os.path.isdir(dirpath) or len(os.listdir(dirpath)) == 0 -def _tokenize_formatted_example( - example: Dict[str, Any], - tokenizer: PreTrainedTokenizerBase) -> Dict[str, List[int]]: +def _slice_chat_formatted_example( + example: ChatFormattedDict, + tokenizer: PreTrainedTokenizerBase) -> Tuple[str, str]: + """Slices the chat example into a formatted prompt and response. + + Args: + example (ChatFormattedDict): The chat example containing the messages. + tokenizer (PreTrainedTokenizerBase): The tokenizer to apply the chat template. + + Returns: + Tuple[str, str]: The prompt and response as separate strings. + + Raises: + ValueError: If the chat example has less than two messages or if the last message is not from the assistant. + KeyError: If a message does not have a role or content. + """ + messages = example['messages'] + + if len(messages) < 2: + raise ValueError( + f'chat example must have at least two messages. {messages=}') + last_message = messages[-1] + if last_message['role'] != 'assistant': + raise ValueError( + f'last message must be from assistant. 
{last_message=}') + for message in messages: + if 'role' not in message or 'content' not in message: + raise KeyError(f'message must have role and content. {message=}') + + full_conversation = tokenizer.apply_chat_template(messages, tokenize=False) + prompt = tokenizer.apply_chat_template(messages[:-1], + tokenize=False, + add_generation_prompt=True) + if prompt != full_conversation[:len(prompt)]: + raise ValueError( + f'prompt must be the first part of the full conversation. {prompt=}, {full_conversation=}' + ) + response = full_conversation[len(prompt):] + if len(response) == 0: + raise ValueError( + f'chat example must have at least one assistant message. {messages=}' + ) + return prompt, response + + +def _tokenize_chat_formatted_example( + example: ChatFormattedDict, + tokenizer: PreTrainedTokenizerBase) -> TokenizedExample: + """Tokenizes a chat-formatted example using the provided tokenizer. + + Args: + example (ChatFormattedDict): The chat-formatted example to tokenize. + tokenizer (PreTrainedTokenizerBase): The tokenizer to use for tokenization. + + Returns: + TokenizedExample: The tokenized example. + """ + prompt, response = _slice_chat_formatted_example(example, tokenizer) + return tokenizer(text=prompt, text_target=response) + + +def _tokenize_prompt_response_formatted_example( + example: PromptResponseDict, + tokenizer: PreTrainedTokenizerBase) -> TokenizedExample: """Tokenize a formatted example and validate expected keys.""" example_keys = set(example.keys()) prompt_keys = example_keys.intersection(_ALLOWED_PROMPT_KEYS) @@ -108,6 +199,35 @@ def _tokenize_formatted_example( return tokenizer(text=prompt, text_target=response) +def _tokenize_formatted_example( + example: Example, + tokenizer: PreTrainedTokenizerBase) -> TokenizedExample: + """Tokenizes a formatted example using the provided tokenizer. + + Args: + example (Example): The input example to be tokenized. + tokenizer (PreTrainedTokenizerBase): The tokenizer to be used for tokenization. + + Returns: + TokenizedExample: The tokenized example. + + Raises: + ValueError: If the example format is unknown. + """ + example_format = _get_example_type(example) + + if example_format == 'chat': + chat_example = cast(ChatFormattedDict, example) + return _tokenize_chat_formatted_example(chat_example, tokenizer) + elif example_format == 'prompt_response': + prompt_response_example: PromptResponseDict = cast( + PromptResponseDict, example) + return _tokenize_prompt_response_formatted_example( + prompt_response_example, tokenizer) + else: + raise ValueError(f'Unknown conversation type {example_format=}') + + class StreamingFinetuningDataset(StreamingDataset): """Finetuning dataset with flexible tokenization using StreamingDataset. 
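For readers following the new code path, a minimal sketch (not part of the committed diff) of how the dispatch added above behaves; it assumes only names introduced in this patch plus a tokenizer that ships a chat template (the tests below use 'mosaicml/mpt-7b-8k-chat'):

    from transformers import AutoTokenizer

    from llmfoundry.data.finetuning.tasks import _tokenize_formatted_example

    tokenizer = AutoTokenizer.from_pretrained('mosaicml/mpt-7b-8k-chat')

    # prompt/response pair: keys drawn from _ALLOWED_PROMPT_KEYS / _ALLOWED_RESPONSE_KEYS
    instruct_example = {'prompt': 'Hello, GPT', 'response': 'this is my response'}

    # chat format: a 'messages' list whose final entry comes from the assistant
    chat_example = {
        'messages': [
            {'role': 'user', 'content': 'Hello, GPT'},
            {'role': 'assistant', 'content': 'this is my response'},
        ]
    }

    # _get_example_type routes each dict to the matching tokenization path; both
    # results carry 'input_ids' and 'labels'
    for example in (instruct_example, chat_example):
        tokenized = _tokenize_formatted_example(example, tokenizer)
        assert 'input_ids' in tokenized and 'labels' in tokenized
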
diff --git a/tests/data/test_dataloader.py b/tests/data/test_dataloader.py index 44d0442a87..73e8427505 100644 --- a/tests/data/test_dataloader.py +++ b/tests/data/test_dataloader.py @@ -23,11 +23,8 @@ from llmfoundry import (build_finetuning_dataloader, build_text_denoising_dataloader) from llmfoundry.data import build_dataloader -from llmfoundry.data.finetuning.tasks import (_ALLOWED_PROMPT_KEYS, - _ALLOWED_RESPONSE_KEYS, - DOWNLOADED_FT_DATASETS_DIRPATH, - SUPPORTED_EXTENSIONS, - _tokenize_formatted_example) +from llmfoundry.data.finetuning.tasks import (DOWNLOADED_FT_DATASETS_DIRPATH, + SUPPORTED_EXTENSIONS) from llmfoundry.data.text_data import (ConcatenatedSequenceCollatorWrapper, build_text_dataloader, get_tokens_per_batch_func) @@ -249,10 +246,12 @@ def test_denoising_dataloader(decoder_only_format: bool, pretokenize: bool, break +@pytest.mark.parametrize('use_chat_formatting', [True, False]) @pytest.mark.parametrize('decoder_only_format', [True, False]) @pytest.mark.parametrize('allow_pad_trimming', [True, False]) @pytest.mark.parametrize('packing_ratio', [10.0, None, 'auto']) -def test_finetuning_dataloader(decoder_only_format: bool, +def test_finetuning_dataloader(use_chat_formatting: bool, + decoder_only_format: bool, allow_pad_trimming: bool, packing_ratio: Optional[Union[float, Literal['auto']]]): @@ -265,13 +264,21 @@ def test_finetuning_dataloader(decoder_only_format: bool, cfg = { 'name': 'finetuning', 'dataset': { - 'hf_name': 'HuggingFaceH4/databricks_dolly_15k', - 'split': 'train', - 'max_seq_len': max_seq_len, - 'decoder_only_format': decoder_only_format, - 'allow_pad_trimming': allow_pad_trimming, - 'packing_ratio': packing_ratio, - 'shuffle': True, + 'hf_name': + 'iamroot/chat_formatted_examples' if use_chat_formatting else + 'HuggingFaceH4/databricks_dolly_15k', + 'split': + 'train', + 'max_seq_len': + max_seq_len, + 'decoder_only_format': + decoder_only_format, + 'allow_pad_trimming': + allow_pad_trimming, + 'packing_ratio': + packing_ratio, + 'shuffle': + True, }, 'drop_last': False, 'num_workers': 0, @@ -417,39 +424,6 @@ def test_finetuning_dataloader_small_data(dataset_size: int, shutil.rmtree(tiny_dataset_folder_path) -def test_tokenize_example_malformed(): - no_keys = {} - no_prompt_key = {'response': 'response'} - no_response_key = {'prompt': 'prompt'} - extra_keys_with_prompt = {'prompt': 'prompt', 'extra': 'extra'} - extra_keys_with_response = {'response': 'response', 'extra': 'extra'} - multiple_allowed_response_keys = { - 'prompt': 'prompt', - 'response': 'response', - 'completion': 'completion' - } - - malformed_examples = [ - no_keys, no_prompt_key, no_response_key, extra_keys_with_prompt, - extra_keys_with_response, multiple_allowed_response_keys - ] - - for example in malformed_examples: - with pytest.raises(KeyError): - _tokenize_formatted_example(example, MagicMock()) - - -def test_tokenize_example_well_formed(): - tokenizer = transformers.AutoTokenizer.from_pretrained('gpt2') - - for prompt_key in _ALLOWED_PROMPT_KEYS: - for response_key in _ALLOWED_RESPONSE_KEYS: - example = {prompt_key: 'prompt', response_key: 'response'} - tokenized_example = _tokenize_formatted_example(example, tokenizer) - assert 'input_ids' in tokenized_example - assert 'labels' in tokenized_example - - @pytest.mark.parametrize('split', ['train', 'custom', 'data']) def test_finetuning_dataloader_custom_split(tmp_path: pathlib.Path, split: str): tokenizer_name = 'gpt2' diff --git a/tests/data/test_template_tokenization.py b/tests/data/test_template_tokenization.py new file 
mode 100644 index 0000000000..258e491b32 --- /dev/null +++ b/tests/data/test_template_tokenization.py @@ -0,0 +1,163 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +from unittest.mock import MagicMock + +import pytest +import transformers + +from llmfoundry.data.finetuning.tasks import (_ALLOWED_PROMPT_KEYS, + _ALLOWED_RESPONSE_KEYS, + _slice_chat_formatted_example, + _tokenize_formatted_example) +from llmfoundry.utils.builders import build_tokenizer + + +def test_tokenize_chat_example_malformed(): + no_content = {'messages': [{'role': 'user'}]} + too_few_messages = { + 'messages': [{ + 'role': 'assistant', + 'content': 'Hi, User!' + }] + } + ends_with_user_role = { + 'messages': [{ + 'role': 'user', + 'content': 'Hello GPT!' + }, { + 'role': 'assistant', + 'content': 'Hi, User!' + }, { + 'role': 'user', + 'content': 'user message not followed by an assistant label' + }] + } + no_assistant_message = { + 'messages': [{ + 'role': 'user', + 'content': 'Hello GPT!' + }, { + 'role': 'user', + 'content': 'user message not followed by an assistant label' + }] + } + malformed_chat_examples = [ + too_few_messages, no_content, ends_with_user_role, no_assistant_message + ] + my_tokenizer = build_tokenizer('mosaicml/mpt-7b-8k-chat', {}) + for example in malformed_chat_examples: + with pytest.raises(Exception): + _tokenize_formatted_example( + example, my_tokenizer + ) # type: ignore (the typing here is supposed to be malformed) + + +def test_tokenize_chat_example_well_formed(): + chat_examples = [ + { + 'messages': [{ + 'role': 'user', + 'content': 'Hello, GPT' + }, { + 'role': 'assistant', + 'content': 'this is my response' + }] + }, # prompt/response but in chat format + { + 'messages': [ + { + 'role': 'user', + 'content': 'Hello, GPT' + }, + { + 'role': 'assistant', + 'content': 'this is my response' + }, + { + 'role': 'user', + 'content': 'Nice to hear that.' + }, + { + 'role': 'assistant', + 'content': 'multi-way chat works too!' + }, + ] + }, # multi-way chat + ] + + expected = [ + { + 'prompt': + '''<|im_start|>system +A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers. +<|im_start|>user +Hello, GPT<|im_end|> +<|im_start|>assistant +''', + 'response': + 'this is my response<|im_end|>' + }, + { + 'prompt': + '''<|im_start|>system +A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers. 
+<|im_start|>user +Hello, GPT<|im_end|> +<|im_start|>assistant +this is my response<|im_end|> +<|im_start|>user +Nice to hear that.<|im_end|> +<|im_start|>assistant +''', + 'response': + 'multi-way chat works too!<|im_end|>' + }, + ] + + chat_tokenizer = build_tokenizer('mosaicml/mpt-7b-8k-chat', {}) + assert len(expected) == len( + chat_examples) # if we add a new example, zip shouldn't fail silently + for chat_example, expected_stringification in zip(chat_examples, expected): + prompt, response = _slice_chat_formatted_example( + chat_example, chat_tokenizer) + tokenized_example = _tokenize_formatted_example(chat_example, + chat_tokenizer) + assert prompt == expected_stringification['prompt'] + assert response == expected_stringification['response'] + assert 'input_ids' in tokenized_example + assert 'labels' in tokenized_example + + +def test_tokenize_instruct_example_malformed(): + no_keys = {} + no_prompt_key = {'response': 'response'} + no_response_key = {'prompt': 'prompt'} + extra_keys_with_prompt = {'prompt': 'prompt', 'extra': 'extra'} + extra_keys_with_response = {'response': 'response', 'extra': 'extra'} + multiple_allowed_response_keys = { + 'prompt': 'prompt', + 'response': 'response', + 'completion': 'completion' + } + + malformed_prompt_response_examples = [ + no_keys, no_prompt_key, no_response_key, extra_keys_with_prompt, + extra_keys_with_response, multiple_allowed_response_keys + ] + + for example in malformed_prompt_response_examples: + with pytest.raises(KeyError): + _tokenize_formatted_example(example, MagicMock()) + + +def test_tokenize_instruct_example_well_formed(): + tokenizer = transformers.AutoTokenizer.from_pretrained('gpt2') + + for prompt_key in _ALLOWED_PROMPT_KEYS: + for response_key in _ALLOWED_RESPONSE_KEYS: + + example = {prompt_key: 'prompt', response_key: 'response'} + tokenized_example = _tokenize_formatted_example(example, tokenizer) + assert 'input_ids' in tokenized_example + assert 'labels' in tokenized_example From 534f5b4d415e3c30eea142229ecd642e368adfa4 Mon Sep 17 00:00:00 2001 From: Nicholas Garcia Date: Fri, 26 Jan 2024 13:49:23 -0800 Subject: [PATCH 14/16] Add flag to enable/disable param upload (#912) * Add flag to enable/disable param upload * Yapf * Apply suggestions from code review Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> * Rename * Add to eval --------- Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- scripts/eval/eval.py | 13 ++++++++++--- scripts/train/train.py | 9 +++++++-- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 8dbe91e6d2..c783a4f513 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -121,6 +121,7 @@ def evaluate_model( icl_subset_num_batches: Optional[int], metadata: Optional[Dict[str, str]], logged_config: DictConfig, + should_log_config: bool = True, ): log.info(f'Evaluating model: {model_cfg.model_name}') @@ -215,8 +216,9 @@ def evaluate_model( python_log_level=python_log_level, ) - log.info('Evaluation config:') - log_config(logged_config) + if should_log_config: + log.info('Evaluation config:') + log_config(logged_config) log.info(f'Starting eval for {model_cfg.model_name}...') if torch.cuda.is_available(): @@ -306,6 +308,10 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: must_exist=False, default_value=None, convert=True) + should_log_config: bool = pop_config(cfg, + 'log_config', + must_exist=False, + default_value=True) # Pop out interpolation variables. 
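    # (A configuration sketch, not part of the committed diff: should_log_config above
    # is read from a top-level `log_config` key in the run YAML, so disabling the full
    # config dump would look like
    #
    #   log_config: false
    #
    # in the eval or train config. This mirrors the pop_config(cfg, 'log_config', ...)
    # calls added in this patch. The interpolation-variable pops follow below.)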
pop_config(cfg, 'model_name_or_path', must_exist=False, default_value=None) @@ -352,7 +358,8 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: eval_gauntlet_df=eval_gauntlet_df, icl_subset_num_batches=icl_subset_num_batches, metadata=metadata, - logged_config=logged_cfg) + logged_config=logged_cfg, + should_log_config=should_log_config) trainers.append(trainer) if eval_gauntlet_callback is not None: diff --git a/scripts/train/train.py b/scripts/train/train.py index 638ad8aaea..f28f8718ba 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -392,6 +392,10 @@ def main(cfg: DictConfig) -> Trainer: must_exist=False, default_value=None, convert=True) + should_log_config: bool = pop_config(cfg, + 'log_config', + must_exist=False, + default_value=True) # Enable autoresume from model checkpoints if possible autoresume_default: bool = False @@ -622,8 +626,9 @@ def main(cfg: DictConfig) -> Trainer: compile_config=compile_config, ) - log.info('Logging config') - log_config(logged_cfg) + if should_log_config: + log.info('Logging config') + log_config(logged_cfg) torch.cuda.empty_cache() gc.collect() From bdcce63781596d88b02d813c1e0ff875f67bd12b Mon Sep 17 00:00:00 2001 From: Anna Date: Fri, 26 Jan 2024 17:20:03 -0800 Subject: [PATCH 15/16] Add support for eval_loader & eval_subset_num_batches in async callback (#834) * Skip evalloader in training if using async eval * add support for subset_num_batches * remove todo * eval first * rename arg * fix * small updates * om * fix test * eval run config --------- Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- llmfoundry/callbacks/async_eval_callback.py | 78 ++++++++++++++++----- llmfoundry/utils/builders.py | 2 +- scripts/eval/eval.py | 17 +++-- scripts/train/train.py | 46 ++++++------ tests/callbacks/test_async_eval_callback.py | 54 ++++++++++---- 5 files changed, 139 insertions(+), 58 deletions(-) diff --git a/llmfoundry/callbacks/async_eval_callback.py b/llmfoundry/callbacks/async_eval_callback.py index 4227448d87..6cd57d440c 100644 --- a/llmfoundry/callbacks/async_eval_callback.py +++ b/llmfoundry/callbacks/async_eval_callback.py @@ -19,7 +19,7 @@ from composer.utils import dist from composer.utils.misc import create_interval_scheduler -from mcli import ComputeConfig, Run, RunConfig, create_run, get_run +from mcli import Run, RunConfig, create_run, get_run log = logging.getLogger(__name__) @@ -33,7 +33,9 @@ OPTIONAL_PARAMS_FOR_EVAL = { 'dist_timeout', 'eval_gauntlet', + 'eval_loader', 'fsdp_config', + 'eval_subset_num_batches', 'icl_subset_num_batches', 'loggers', 'precision', @@ -175,50 +177,84 @@ def validate_interval(interval: Union[str, int, Time], return async_interval +def validate_eval_run_config( + eval_run_config: Optional[Dict[str, Any]]) -> Dict[str, Any]: + + if not eval_run_config: + return {} + + run_config = eval_run_config.copy() + + supported_keys = {'image', 'command', 'compute', 'scheduling'} + found_unsupported = set() + for key in run_config: + if key not in supported_keys: + found_unsupported.add(key) + + if found_unsupported: + raise ValueError( + f'Unsupported eval run config keys found: {", ".join(found_unsupported)}' + + f'. Supported keys: {supported_keys}') + + return run_config + + class AsyncEval(Callback): """Run the eval loop asynchronously as part of a MosaicML platform run. This callback is currently experimental. The API may change in the future. 
Args: - training_config: Dict[str, Any]: The config from the training run + training_params: Dict[str, Any]: The parameter config from the training run interval: Union[str, int, Time]: The interval describing how often eval runs should be launched. If an integer, it will be assumed to be in :attr:`.TimeUnit.EPOCH`. Otherwise, the unit must be either :attr:`.TimeUnit.EPOCH`, :attr:`.TimeUnit.BATCH`, :attr:`.TimeUnit.TOKEN`, or :attr:`.TimeUnit.SAMPLE`. - compute: Optional[Union[ComputeConfig, Dict[str, Any]]]: The compute configuration to - use for the eval run. If not provided, the same cluster as the current run and a - single, full GPU node will be used. + eval_run_config: Optional[Dict[str, Any]]: A subset of mcli run config values to use + for the eval run. If not specified, any fields from run config will be created + dynamically from the training run config and the interval. The following fields + are supported: + - ``image``: Image of the eval run. Default: same as training run + - ``command``: Command to run for the eval run. Default: calls + `composer scripts/eval/eval.py $PARAMETERS`. If custom setup is needed, + the command should include calling the eval script with $PARAMETERS + - ``compute``: Compute to use for the eval run. Default: same cluster as + the training run and a single node (8 GPUs) + - ``scheduling``: Scheduling to use for the eval run. Default: same as training run + + All fields are optional, but if specified, must be valid for a mcli run config. We + provide this optional config to give you the most flexibility in customizing the eval + run, but it is recommended to use the default values unless you have a specific use case """ def __init__( self, - training_config: Dict[str, Any], + training_params: Dict[str, Any], interval: Union[str, int, Time], - compute: Optional[Union[ComputeConfig, Dict[str, Any]]] = None, + eval_run_config: Optional[Dict[str, Any]] = None, ): for required in ('save_interval', 'save_folder'): - if required not in training_config: + if required not in training_params: raise ValueError(f'{required} required for async eval') - self.checkpoint_save_folder = training_config['save_folder'] - self.training_config = training_config + self.checkpoint_save_folder = training_params['save_folder'] + self.training_params = training_params + self.eval_run_config = validate_eval_run_config(eval_run_config) self.interval = validate_interval(interval, - self.training_config['save_interval']) + self.training_params['save_interval']) self.check_interval = create_interval_scheduler( interval, # There is a custom close to ensure that the final checkpoint # (which is the most important) is evaled after it is written include_end_of_training=False, ) - self.compute = compute self.last_checkpoint: Optional[str] = None # Run these during init to fail fast in any of the error cases self.current_run = self._get_current_run() get_eval_parameters( - parameters=training_config, + parameters=training_params, checkpoint='test', training_run_name=self.current_run.name, ) @@ -259,7 +295,7 @@ def close(self, state: State, logger: Logger) -> None: if dist.get_global_rank() != 0: return - save_latest_filename = self.training_config.get('save_latest_filename', + save_latest_filename = self.training_params.get('save_latest_filename', None) if not save_latest_filename: @@ -297,7 +333,7 @@ def launch_run(self, checkpoint: str, current_interval: Time) -> Run: run_name = get_run_name(self.current_run.name, str(current_interval)) params = get_eval_parameters( - 
parameters=self.training_config, + parameters=self.training_params, checkpoint=checkpoint, training_run_name=self.current_run.name, ) @@ -347,12 +383,16 @@ def launch_run(self, checkpoint: str, current_interval: Time) -> Run: # TODO: This just runs an eval run, but we also want to attach the # deployment, which would require a hf conversion and parametrizing the # dependent_deployment in the run config - command = f'cd {installation_path}/scripts \n composer eval/eval.py $PARAMETERS' + default_command = f'cd {installation_path}/scripts \n composer eval/eval.py $PARAMETERS' run_config = RunConfig( name=run_name, - image=self.current_run.image, - compute=self.compute or default_compute, - command=command, + image=self.eval_run_config.get('image', self.current_run.image), + command=self.eval_run_config.get('command', default_command), + compute=self.eval_run_config.get('compute', default_compute), + scheduling=self.eval_run_config.get( + 'scheduling', + self.current_run.submitted_config.scheduling, + ), integrations=integrations, env_variables=cfg.env_variables, metadata=cfg.metadata, diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 29642381f8..42f817b386 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -213,7 +213,7 @@ def build_callback( raise ValueError( 'Parameters config is required for async eval callback') - return AsyncEval(**kwargs, training_config=config) + return AsyncEval(**kwargs, training_params=config) else: raise ValueError(f'Not sure how to build callback: {name}') diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index c783a4f513..d4ba39acfa 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -118,6 +118,7 @@ def evaluate_model( python_log_level: Optional[str], precision: str, eval_gauntlet_df: Optional[pd.DataFrame], + eval_subset_num_batches: int, icl_subset_num_batches: Optional[int], metadata: Optional[Dict[str, str]], logged_config: DictConfig, @@ -224,7 +225,8 @@ def evaluate_model( if torch.cuda.is_available(): torch.cuda.synchronize() a = time.time() - trainer.eval(eval_dataloader=evaluators) + trainer.eval(eval_dataloader=evaluators, + subset_num_batches=eval_subset_num_batches) if torch.cuda.is_available(): torch.cuda.synchronize() b = time.time() @@ -299,10 +301,14 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: 'loggers', must_exist=False, default_value={}) - icl_subset_num_batches: int = pop_config(cfg, - 'icl_subset_num_batches', - must_exist=False, - default_value=None) + eval_subset_num_batches: int = pop_config(cfg, + 'eval_subset_num_batches', + must_exist=False, + default_value=-1) + icl_subset_num_batches: Optional[int] = pop_config(cfg, + 'icl_subset_num_batches', + must_exist=False, + default_value=None) metadata: Optional[Dict[str, str]] = pop_config(cfg, 'metadata', must_exist=False, @@ -356,6 +362,7 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: python_log_level=python_log_level, precision=precision, eval_gauntlet_df=eval_gauntlet_df, + eval_subset_num_batches=eval_subset_num_batches, icl_subset_num_batches=icl_subset_num_batches, metadata=metadata, logged_config=logged_cfg, diff --git a/scripts/train/train.py b/scripts/train/train.py index f28f8718ba..7bb5e71394 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -283,7 +283,8 @@ def main(cfg: DictConfig) -> Trainer: callback_configs: Optional[DictConfig] = pop_config(cfg, 'callbacks', must_exist=False, - default_value=None) + default_value=None, + convert=True) 
algorithm_configs: Optional[DictConfig] = pop_config(cfg, 'algorithms', must_exist=False, @@ -519,8 +520,7 @@ def main(cfg: DictConfig) -> Trainer: for name, callback_cfg in callback_configs.items() ] if callback_configs else [] - use_async_eval = any( - isinstance(callback, AsyncEval) for callback in callbacks) + use_async_eval = any(isinstance(c, AsyncEval) for c in callbacks) # Algorithms algorithms = [ @@ -540,22 +540,28 @@ def main(cfg: DictConfig) -> Trainer: mosaicml_logger.log_metrics({'data_validated': time.time()}) ## Evaluation - log.info('Building eval loader...') - eval_icl_seq_len: int = icl_seq_len if icl_seq_len else max_seq_len - # TODO: evaluators should not be built at all if use_async_eval is True - # This will be fixed when eval_loader support is fully added to AsyncEval - evaluators, _, eval_gauntlet_callback = build_evaluators( - eval_loader_config, - icl_tasks_config if not use_async_eval else None, - eval_gauntlet_config if not use_async_eval else None, - tokenizer=tokenizer, - device_eval_batch_size=device_eval_batch_size, - icl_seq_len=eval_icl_seq_len, - icl_subset_num_batches=icl_subset_num_batches, - ) - - if eval_gauntlet_callback is not None and not use_async_eval: - callbacks.append(eval_gauntlet_callback) + if use_async_eval: + evaluators = [] + if eval_first: + warnings.warn( + 'AsyncEval callback does not support eval_first=True. Ignoring.' + ) + eval_first = False + + else: + log.info('Building eval loader...') + eval_icl_seq_len: int = icl_seq_len if icl_seq_len else max_seq_len + evaluators, _, eval_gauntlet_callback = build_evaluators( + eval_loader_config, + icl_tasks_config, + eval_gauntlet_config, + tokenizer=tokenizer, + device_eval_batch_size=device_eval_batch_size, + icl_seq_len=eval_icl_seq_len, + icl_subset_num_batches=icl_subset_num_batches, + ) + if eval_gauntlet_callback is not None: + callbacks.append(eval_gauntlet_callback) # Build Model log.info('Initializing model...') @@ -582,7 +588,7 @@ def main(cfg: DictConfig) -> Trainer: optimizer = build_optimizer(model, optimizer_name, optimizer_config) # Now add the eval metrics - if eval_loader_config is not None: + if eval_loader_config is not None and not use_async_eval: train_metrics = model.get_metrics(is_train=True) evaluators = add_metrics_to_eval_loaders(evaluators, train_metrics) diff --git a/tests/callbacks/test_async_eval_callback.py b/tests/callbacks/test_async_eval_callback.py index b3a1e98f79..92cb738d9c 100644 --- a/tests/callbacks/test_async_eval_callback.py +++ b/tests/callbacks/test_async_eval_callback.py @@ -11,10 +11,10 @@ from llmfoundry.callbacks.async_eval_callback import (AsyncEval, get_eval_parameters, get_run_name, + validate_eval_run_config, validate_interval) from mcli import Run, RunConfig, RunStatus -# here RUN_NAME = 'foo_bar-1234' BASIC_PARAMS = { 'save_interval': '1ba', @@ -191,6 +191,29 @@ def test_validate_interval(): assert validate_interval('2ep', two_epochs) == two_epochs +def test_validate_eval_run_config(): + assert validate_eval_run_config(None) == {} + assert validate_eval_run_config({}) == {} + + with pytest.raises(ValueError): + validate_eval_run_config({'foo': 'bar'}) + + valid_config = { + 'image': 'example_image', + 'command': 'example_command', + 'compute': { + 'gpus': 1, + 'cluster': 'example_cluster', + }, + 'scheduling': { + 'priority': 'high', + 'preemptible': True, + }, + } + res = validate_eval_run_config(valid_config) + assert res == valid_config + + FAKE_RUN = Run( run_uid='123', name=RUN_NAME, @@ -223,12 +246,16 @@ def 
test_validate_interval(): return_value=FAKE_RUN) def test_async_eval_callback_minimal(mock_create_run: MagicMock, mock_get_run: MagicMock): - callback = AsyncEval(BASIC_PARAMS, - interval='2ba', - compute={ - 'cluster': 'c2z3', - 'nodes': 2, - }) + callback = AsyncEval( + BASIC_PARAMS, + interval='2ba', + eval_run_config={ + 'compute': { + 'cluster': 'c2z3', + 'nodes': 2, + }, + }, + ) assert callback.current_run.name == RUN_NAME assert mock_get_run.call_count == 1 assert mock_get_run.call_args[0][0] == RUN_NAME @@ -310,12 +337,13 @@ def test_async_eval_callback_minimal(mock_create_run: MagicMock, return_value=FAKE_RUN_WITH_INTEGRATIONS) def test_async_eval_callback_integrations(mock_create_run: MagicMock, mock_get_run: MagicMock): - callback = AsyncEval(BASIC_PARAMS, - interval='2ba', - compute={ - 'cluster': 'c2z3', - 'nodes': 2, - }) + callback = AsyncEval( + BASIC_PARAMS, + interval='2ba', + eval_run_config={'compute': { + 'cluster': 'c2z3', + 'nodes': 2, + }}) assert mock_get_run.call_count == 1 callback.launch_run('checkpoint/path', Time(1, TimeUnit.BATCH)) From 34cdaf68fde5e92f42b41ae0e68d45ccc22e0d8a Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Sun, 28 Jan 2024 21:59:18 -0800 Subject: [PATCH 16/16] Add the model license file for mlflow (#915) --- llmfoundry/callbacks/hf_checkpointer.py | 27 +++++++++++++++++++ setup.py | 3 ++- .../inference/test_convert_composer_to_hf.py | 25 ++++++++++++++--- 3 files changed, 51 insertions(+), 4 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index b50d81d09e..904f2e208f 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -6,6 +6,7 @@ import logging import math import os +import re import tempfile from pathlib import Path from typing import Optional, Sequence, Union @@ -27,6 +28,23 @@ log = logging.getLogger(__name__) +_LICENSE_FILE_PATTERN = re.compile(r'license(\.[a-z]+|$)', re.IGNORECASE) + + +def _maybe_get_license_filename(local_dir: str) -> Optional[str]: + """Returns the name of the license file if it exists in the local_dir. + + Note: This is intended to be consistent with the code in MLflow. + https://github.com/mlflow/mlflow/blob/5d13d6ec620a02de9a5e31201bf1becdb9722ea5/mlflow/transformers/__init__.py#L1152 + + If the license file does not exist, returns None. + """ + try: + return next(file for file in os.listdir(local_dir) + if _LICENSE_FILE_PATTERN.search(file)) + except StopIteration: + return None + class HuggingFaceCheckpointer(Callback): """Save a huggingface formatted checkpoint during training. 
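As a quick illustration of the license-lookup helper added above, the following sketch (not part of the patch; the temporary directory and file contents are made up for the example) shows the intended behavior:

    # Illustrative only: exercise _maybe_get_license_filename against a scratch directory.
    import os
    import tempfile

    from llmfoundry.callbacks.hf_checkpointer import _maybe_get_license_filename

    with tempfile.TemporaryDirectory() as tmp_dir:
        # Empty directory: the generator finds nothing and the helper returns None.
        assert _maybe_get_license_filename(tmp_dir) is None

        # Any filename matching the case-insensitive pattern license(\.[a-z]+|$)
        # is returned as-is (bare name, not a full path).
        with open(os.path.join(tmp_dir, 'LICENSE.txt'), 'w') as f:
            f.write('placeholder license text')
        assert _maybe_get_license_filename(tmp_dir) == 'LICENSE.txt'
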
@@ -279,6 +297,15 @@ def _save_checkpoint(self, state: State, logger: Logger): path=local_save_path, **self.mlflow_logging_config, ) + + license_filename = _maybe_get_license_filename( + local_save_path) + if license_filename is not None: + mlflow_logger._mlflow_client.log_artifact( + mlflow_logger._run_id, + os.path.join(local_save_path, license_filename), + ) + mlflow_logger.register_model( model_uri=local_save_path, name=self.mlflow_registered_model_name, diff --git a/setup.py b/setup.py index 511e665ed4..e5bc7e81d2 100644 --- a/setup.py +++ b/setup.py @@ -50,7 +50,8 @@ ] install_requires = [ - 'mosaicml[libcloud,wandb,mlflow,oci,gcs]>=0.17.2,<0.18', + 'mosaicml[libcloud,wandb,oci,gcs]>=0.17.2,<0.18', + 'mlflow>=2.10,<3', 'accelerate>=0.25,<0.26', # for HF inference `device_map` 'transformers>=4.37,<4.38', 'mosaicml-streaming>=0.7.2,<0.8', diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py index deed181475..e85cdb213c 100644 --- a/tests/a_scripts/inference/test_convert_composer_to_hf.py +++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py @@ -6,7 +6,7 @@ import pathlib import shutil from argparse import Namespace -from typing import Callable, Optional, cast +from typing import Any, Callable, Optional, cast from unittest.mock import ANY, MagicMock, patch import pytest @@ -22,6 +22,7 @@ from llmfoundry import COMPOSER_MODEL_REGISTRY from llmfoundry.callbacks import HuggingFaceCheckpointer +from llmfoundry.callbacks.hf_checkpointer import _maybe_get_license_filename from llmfoundry.data.finetuning import build_finetuning_dataloader from llmfoundry.models.mpt.modeling_mpt import ComposerMPTCausalLM from llmfoundry.utils.builders import build_optimizer, build_tokenizer @@ -29,6 +30,10 @@ from tests.data_utils import make_tiny_ft_dataset +def _save_model_mock(*args: Any, path: str, **kwargs: Any): + os.makedirs(path, exist_ok=True) + + def check_hf_tokenizer_equivalence(tokenizer1: PreTrainedTokenizerBase, tokenizer2: PreTrainedTokenizerBase): """WARNING: Parameters are updated within the check so don't call check_hf_tokenizer_equivalence on the same @@ -297,7 +302,7 @@ def test_huggingface_conversion_callback_interval( mlflow_logger_mock = MagicMock(spec=MLFlowLogger) mlflow_logger_mock.state_dict = lambda *args, **kwargs: {} - mlflow_logger_mock.save_model = MagicMock() + mlflow_logger_mock.save_model = MagicMock(wraps=_save_model_mock) mlflow_logger_mock.register_model = MagicMock() mlflow_logger_mock.model_registry_prefix = '' mlflow_logger_mock._experiment_id = 'mlflow-experiment-id' @@ -533,7 +538,7 @@ def test_huggingface_conversion_callback( mlflow_logger_mock = MagicMock(spec=MLFlowLogger) mlflow_logger_mock.state_dict = lambda *args, **kwargs: {} - mlflow_logger_mock.save_model = MagicMock() + mlflow_logger_mock.save_model = MagicMock(wraps=_save_model_mock) mlflow_logger_mock.register_model = MagicMock() mlflow_logger_mock.model_registry_prefix = '' mlflow_logger_mock._experiment_id = 'mlflow-experiment-id' @@ -817,3 +822,17 @@ def test_convert_and_generate_meta(tie_word_embeddings: str, assert torch.allclose(p1, p2) delete_transformers_cache() + + +@pytest.mark.parametrize( + 'license_file_name', + ['LICENSE', 'LICENSE.txt', 'license', 'license.md', None]) +def test_license_file_finder(tmp_path: pathlib.Path, + license_file_name: Optional[str]): + if license_file_name is not None: + with open(os.path.join(tmp_path, license_file_name), 'w') as f: + f.write('test') + + found_path = 
_maybe_get_license_filename(str(tmp_path)) + assert (found_path == license_file_name + ) if license_file_name is not None else (found_path is None)
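
To round out the async-eval change earlier in this series (#834), here is a small sketch of how the new validate_eval_run_config helper is expected to behave; the cluster name, GPU count, and scheduling values are placeholders, not part of the patch:

    from llmfoundry.callbacks.async_eval_callback import validate_eval_run_config

    # None or an empty dict collapse to {}. The callback then derives image,
    # command, compute, and scheduling from the training run itself.
    assert validate_eval_run_config(None) == {}
    assert validate_eval_run_config({}) == {}

    # Only the keys image, command, compute, and scheduling are accepted;
    # a valid config is returned unchanged (as a copy).
    overrides = {
        'compute': {'cluster': 'my-eval-cluster', 'gpus': 8},
        'scheduling': {'priority': 'high', 'preemptible': True},
    }
    assert validate_eval_run_config(overrides) == overrides

    # Any other key is rejected up front with a ValueError.
    try:
        validate_eval_run_config({'gpu_type': 'a100'})
    except ValueError:
        pass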