Commit 74abd92: Merge branch 'main' into seed-sampler

mvpatel2000 authored Nov 29, 2024
2 parents 68c0d6c + 41e02d7
Showing 39 changed files with 592 additions and 251 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/docker.yaml
@@ -17,11 +17,11 @@ jobs:
     strategy:
       matrix:
         include:
-          - name: "2.4.0_cu124"
-            base_image: mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04
+          - name: "2.5.1_cu124"
+            base_image: mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu22.04
             dep_groups: "[all]"
-          - name: "2.4.0_cu124_aws"
-            base_image: mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws
+          - name: "2.5.1_cu124_aws"
+            base_image: mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu22.04-aws
             dep_groups: "[all]"
     steps:
4 changes: 2 additions & 2 deletions .github/workflows/pr-cpu.yaml
@@ -21,9 +21,9 @@ jobs:
     strategy:
       matrix:
         include:
-          - name: "cpu-2.4.0"
+          - name: "cpu-2.5.1"
             pip_deps: "[all-cpu]"
-            container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04
+            container: mosaicml/pytorch:2.5.1_cpu-python3.11-ubuntu22.04
             markers: "not gpu"
             pytest_command: "coverage run -m pytest"
     steps:
12 changes: 6 additions & 6 deletions .github/workflows/pr-gpu.yaml
@@ -22,8 +22,8 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - name: "gpu-2.4.0-1"
-            container: mosaicml/llm-foundry:2.4.0_cu124-latest
+          - name: "gpu-2.5.1-1"
+            container: mosaicml/llm-foundry:2.5.1_cu124-latest
             markers: "gpu"
             pip_deps: "[all]"
             pytest_command: "coverage run -m pytest"
@@ -51,8 +51,8 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - name: "gpu-2.4.0-2"
-            container: mosaicml/llm-foundry:2.4.0_cu124-latest
+          - name: "gpu-2.5.1-2"
+            container: mosaicml/llm-foundry:2.5.1_cu124-latest
             markers: "gpu"
             pip_deps: "[all]"
             pytest_command: "coverage run -m pytest"
@@ -80,8 +80,8 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - name: "gpu-2.4.0-4"
-            container: mosaicml/llm-foundry:2.4.0_cu124-latest
+          - name: "gpu-2.5.1-4"
+            container: mosaicml/llm-foundry:2.5.1_cu124-latest
             markers: "gpu"
             pip_deps: "[all]"
             pytest_command: "coverage run -m pytest"
4 changes: 2 additions & 2 deletions .github/workflows/release.yaml
@@ -93,7 +93,7 @@ jobs:
             ${{ env.AWS_DOCKER_TAG }}
             ${{ env.AWS_LATEST_TAG }}
           build-args: |
-            BASE_IMAGE=mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws
+            BASE_IMAGE=mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu22.04-aws
             BRANCH_NAME=${{ env.BRANCH_NAME }}
             DEP_GROUPS=[all]
             KEEP_FOUNDRY=true
@@ -108,7 +108,7 @@ jobs:
             ${{ env.DOCKER_TAG }}
             ${{ env.LATEST_TAG }}
           build-args: |
-            BASE_IMAGE=mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04
+            BASE_IMAGE=mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu22.04
             BRANCH_NAME=${{ env.BRANCH_NAME }}
             DEP_GROUPS=[all]
             KEEP_FOUNDRY=true
12 changes: 6 additions & 6 deletions README.md
@@ -113,24 +113,24 @@ If you have success/failure using LLM Foundry on other systems, please let us know!
 
 | Device         | Torch Version | Cuda Version | Status                       |
 | -------------- | ------------- | ------------ | ---------------------------- |
-| A100-40GB/80GB | 2.4.0         | 12.4         | :white_check_mark: Supported |
-| H100-80GB      | 2.4.0         | 12.4         | :white_check_mark: Supported |
+| A100-40GB/80GB | 2.5.1         | 12.4         | :white_check_mark: Supported |
+| H100-80GB      | 2.5.1         | 12.4         | :white_check_mark: Supported |
 
 ## MosaicML Docker Images
 We highly recommend using our prebuilt Docker images. You can find them here: https://hub.docker.com/orgs/mosaicml/repositories.
 
 The `mosaicml/pytorch` images are pinned to specific PyTorch and CUDA versions, and are stable and rarely updated.
 
 The `mosaicml/llm-foundry` images are built with new tags upon every commit to the `main` branch.
-You can select a specific commit hash such as `mosaicml/llm-foundry:2.4.0_cu124-36ab1ba` or take the latest one using `mosaicml/llm-foundry:2.4.0_cu124-latest`.
+You can select a specific commit hash such as `mosaicml/llm-foundry:2.5.1_cu124-9867a7b` or take the latest one using `mosaicml/llm-foundry:2.5.1_cu124-latest`.
 
 **Please Note:** The `mosaicml/llm-foundry` images do not come with the `llm-foundry` package preinstalled, just the dependencies. You will still need to `pip install llm-foundry` either from PyPI or from source.
 
 | Docker Image                                          | Torch Version | Cuda Version      | LLM Foundry dependencies installed? |
 | ----------------------------------------------------- | ------------- | ----------------- | ----------------------------------- |
-| `mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04` | 2.4.0         | 12.4 (Infiniband) | No                                  |
-| `mosaicml/llm-foundry:2.4.0_cu124-latest`             | 2.4.0         | 12.4 (Infiniband) | Yes                                 |
-| `mosaicml/llm-foundry:2.4.0_cu124_aws-latest`         | 2.4.0         | 12.4 (EFA)        | Yes                                 |
+| `mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu22.04` | 2.5.1         | 12.4 (Infiniband) | No                                  |
+| `mosaicml/llm-foundry:2.5.1_cu124-latest`             | 2.5.1         | 12.4 (Infiniband) | Yes                                 |
+| `mosaicml/llm-foundry:2.5.1_cu124_aws-latest`         | 2.5.1         | 12.4 (EFA)        | Yes                                 |
 
 
 # Installation
2 changes: 1 addition & 1 deletion llmfoundry/_version.py
@@ -3,4 +3,4 @@
 
 """The LLM Foundry Version."""
 
-__version__ = '0.15.0.dev0'
+__version__ = '0.16.0.dev0'
19 changes: 11 additions & 8 deletions llmfoundry/callbacks/hf_checkpointer.py
@@ -743,7 +743,10 @@ def tensor_hook(
         ) if is_te_imported and state.precision == Precision.AMP_FP8 else contextlib.nullcontext(
         )
         with context_manager:
-            new_model_instance.save_pretrained(temp_save_dir)
+            new_model_instance.save_pretrained(
+                temp_save_dir,
+                max_shard_size='1GB',
+            )
         if original_tokenizer is not None:
             assert isinstance(
                 original_tokenizer,
@@ -781,6 +784,10 @@ def tensor_hook(
 
         if dist.get_global_rank() == 0:
             if register_to_mlflow:
+                assert new_model_instance is not None
+                new_model_instance = self.transform_model_pre_registration(
+                    new_model_instance,
+                )
                 if self.using_peft:
 
                     # Save and register peft model to mlflow, this code path uses our older two step logic
@@ -795,11 +802,10 @@ def tensor_hook(
                         temp_save_dir,
                         'register_save',
                     )
-                    assert new_model_instance is not None
-                    new_model_instance = self.transform_model_pre_registration(
-                        new_model_instance,
+                    new_model_instance.save_pretrained(
+                        register_save_dir,
+                        max_shard_size='1GB',
                     )
-                    new_model_instance.save_pretrained(register_save_dir)
                     if original_tokenizer:
                         original_tokenizer.save_pretrained(register_save_dir)
 
@@ -854,9 +860,6 @@ def _save_and_register_peft_model(
         original_tokenizer: Optional[Any],
         save_dir: str,
     ):
-        new_model_instance = self.transform_model_pre_registration(
-            new_model_instance,
-        )
        components = {'model': new_model_instance}
         if original_tokenizer is not None:
             components['tokenizer'] = original_tokenizer
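A note on the `max_shard_size='1GB'` argument threaded through both `save_pretrained` calls: Hugging Face caps each checkpoint shard at the given size instead of the 5GB default. A minimal sketch of the effect outside the callback (model choice and output path are illustrative):

```python
from transformers import AutoModelForCausalLM

# Illustrative stand-in; the callback calls this on the trained model.
model = AutoModelForCausalLM.from_pretrained('gpt2')

# Shards default to at most 5GB each; capping them at 1GB yields more,
# smaller files (model-00001-of-000NN.safetensors plus an index file),
# which keeps individual uploads small when registering to MLflow.
# (gpt2 is small enough to stay in a single file; larger models shard.)
model.save_pretrained('/tmp/hf_ckpt', max_shard_size='1GB')
```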
60 changes: 48 additions & 12 deletions llmfoundry/command_utils/data_prep/convert_delta_to_json.py
@@ -1,6 +1,7 @@
 # Copyright 2022 MosaicML LLM Foundry authors
 # SPDX-License-Identifier: Apache-2.0
 
+import json
 import logging
 import os
 import re
@@ -26,6 +27,8 @@
     FailedToCreateSQLConnectionError,
     FaultyDataPrepCluster,
     InsufficientPermissionsError,
+    MalformedUCTableError,
+    StoragePermissionError,
     UCNotEnabledError,
 )
 
@@ -500,19 +503,18 @@ def fetch(
     from pyspark.errors import AnalysisException
 
     if isinstance(e, (AnalysisException, ServerOperationError)):
-        if 'INSUFFICIENT_PERMISSIONS' in str(e):
-            raise InsufficientPermissionsError(str(e)) from e
-        elif 'UC_NOT_ENABLED' in str(e):
+        error_message = str(e)
+        if 'INSUFFICIENT_PERMISSIONS' in error_message:
+            raise InsufficientPermissionsError(error_message) from e
+        elif 'UC_NOT_ENABLED' in error_message:
             raise UCNotEnabledError() from e
-        elif 'DELTA_TABLE_NOT_FOUND' in str(e):
-            err_str = str(e)
-            # Error string should be in this format:
-            # ---
+        elif 'UNRESOLVED_COLUMN.WITH_SUGGESTION' in error_message:
+            raise MalformedUCTableError(error_message) from e
+        elif 'Delta table' in str(e) and "doesn't exist" in str(e):
             # Error processing `catalog`.`volume_name`.`table_name`:
-            # [DELTA_TABLE_NOT_FOUND] Delta table `volume_name`.`table_name`
-            # doesn't exist.
-            # ---
-            parts = err_str.split('`')
+            # Delta table `volume_name`.`table_name` doesn't exist.
+            parts = error_message.split('`')
             if len(parts) < 7:
                 # Failed to parse error, our codebase is brittle
                 # with respect to the string representations of
@@ -681,7 +683,7 @@ def fetch_DT(
 
     log.info(f'Directory {json_output_folder} created.')
 
-    # validate_and_get_cluster_info allows cluster_id to be None if use_serverless is True
+    # Validate_and_get_cluster_info allows cluster_id to be None if use_serverless is True.
     method, dbsql, sparkSession = validate_and_get_cluster_info(
         cluster_id=cluster_id,
         databricks_host=DATABRICKS_HOST,
@@ -704,6 +706,14 @@ def fetch_DT(
             dbsql,
         )
     except (grpc.RpcError, spark_errors.SparkConnectGrpcException) as e:
+        if isinstance(
+            e,
+            spark_errors.SparkConnectGrpcException,
+        ) and 'is not Shared or Single User Cluster' in str(e):
+            raise FaultyDataPrepCluster(
+                message=
+                f'The cluster you have provided: {cluster_id} does not have data governance enabled. Please use a cluster with a data security mode other than NONE. {e}',
+            ) from e
         if isinstance(
             e,
             spark_errors.SparkConnectGrpcException,
@@ -732,12 +742,38 @@ def fetch_DT(
     if dbsql is not None:
         dbsql.close()
 
-    # combine downloaded jsonl into one big jsonl for IFT
+    # Combine downloaded jsonl into one big jsonl for IFT.
     iterative_combine_jsons(
         json_output_folder,
         os.path.join(json_output_folder, json_output_filename),
     )
 
+    _validate_written_file(
+        json_output_folder,
+        json_output_filename,
+        delta_table_name,
+    )
+
+
+def _validate_written_file(
+    json_output_folder: str,
+    json_output_filename: str,
+    delta_table_name: str,
+):
+    # Validate downloaded dataset is actually downloaded.
+    with open(os.path.join(json_output_folder, json_output_filename)) as f:
+        is_empty = True
+        for line in f.readlines():
+            is_empty = False
+            try:
+                json.loads(line)
+            except Exception as e:
+                raise ValueError(f'Line is not valid json: {line}') from e
+    if is_empty:
+        raise StoragePermissionError(
+            f'Unable to download {delta_table_name}, check network permissions.',
+        )
+
+
 def _check_imports():
     try:
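For readers following the backtick parsing above: the table coordinates land in the odd-indexed fields of the split. A standalone sketch under the error-string shape documented in the comment (the helper name is hypothetical, not part of this module):

```python
def parse_missing_table(error_message: str) -> str:
    # Expected shape, per the comment in fetch():
    #   Error processing `catalog`.`schema`.`table`:
    #   Delta table `schema`.`table` doesn't exist.
    parts = error_message.split('`')
    if len(parts) < 7:
        raise ValueError('unrecognized error format')
    # parts = ['Error processing ', 'catalog', '.', 'schema', '.', 'table', ': ...']
    catalog, schema, table = parts[1], parts[3], parts[5]
    return f'{catalog}.{schema}.{table}'

msg = (
    'Error processing `main`.`my_schema`.`my_table`: '
    "Delta table `my_schema`.`my_table` doesn't exist."
)
print(parse_missing_table(msg))  # main.my_schema.my_table
```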
2 changes: 2 additions & 0 deletions llmfoundry/data/contrastive_pairs/dataloader.py
@@ -13,6 +13,7 @@
 import numpy as np
 import torch
 from composer.core import DataSpec
+from composer.utils import retry
 from streaming import Stream, StreamingDataset
 from torch.utils.data import DataLoader
 from transformers import PreTrainedTokenizerBase
@@ -136,6 +137,7 @@ def _get_contrastive_samples(
             'negative': negative_responses,
         }
 
+    @retry(BlockingIOError, num_attempts=5, initial_backoff=1.0, max_jitter=0.5)
     def __getitem__(self, idx: int) -> dict[str, list[int]]:
         sample = StreamingDataset.__getitem__(self, idx)
         text_samples = []
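The new `retry` decorator (from `composer.utils`) re-invokes `__getitem__` when the streaming backend raises a transient `BlockingIOError`, backing off between attempts. A minimal sketch of the same pattern on a standalone function (the flaky reader is contrived for illustration):

```python
from composer.utils import retry

attempts = 0

# Retries up to 5 times on BlockingIOError, sleeping roughly
# initial_backoff * 2**attempt seconds (plus up to max_jitter) between
# tries; the exception propagates if every attempt fails.
@retry(BlockingIOError, num_attempts=5, initial_backoff=1.0, max_jitter=0.5)
def flaky_read() -> str:
    global attempts
    attempts += 1
    if attempts < 3:  # simulate two transient I/O failures
        raise BlockingIOError('resource temporarily unavailable')
    return 'sample'

print(flaky_read(), f'(succeeded after {attempts} attempts)')
```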
19 changes: 17 additions & 2 deletions llmfoundry/data/finetuning/tasks.py
@@ -34,7 +34,6 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:
 import importlib
 import logging
 import os
-import tempfile
 import warnings
 from collections.abc import Mapping
 from functools import partial
@@ -93,6 +92,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:
     UnknownExampleTypeError,
 )
 # yapf: enable
+from llmfoundry.utils.file_utils import dist_mkdtemp
 from llmfoundry.utils.logging_utils import SpecificWarningFilter
 
 log = logging.getLogger(__name__)
@@ -119,6 +119,15 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:
 ExampleType = Literal['prompt_response', 'chat']
 TokenizedExample = dict[str, list[dict[str, list[int]]]]
 
+_DEFAULT_CHAT_TEMPLATE = (
+    '{% for message in messages %}'
+    "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
+    '{% endfor %}'
+    '{% if add_generation_prompt %}'
+    "{{ '<|im_start|>assistant\n' }}"
+    '{% endif %}'
+)
+
 
 def _get_example_type(example: Example) -> ExampleType:
     """Determines the type of the input example.
@@ -243,17 +252,21 @@ def slice_out_last_turn(
     messages_through_current_turn: list[dict[str, str]],
     conversation_through_previous_turn: str,
 ) -> tuple[str, str]:
+    chat_template = None if tokenizer.chat_template is not None else _DEFAULT_CHAT_TEMPLATE
+
     try:
         full_conversation = tokenizer.apply_chat_template(
             messages_through_current_turn,
             tokenize=False,
             date_string=get_date_string(),
+            chat_template=chat_template,
         )
         prompt_with_history = tokenizer.apply_chat_template(
             messages_through_current_turn[:-1],
             tokenize=False,
             add_generation_prompt=True,
             date_string=get_date_string(),
+            chat_template=chat_template,
         )
     except Exception as e:
         raise ChatTemplateError(
@@ -888,6 +901,8 @@ def build_from_hf(
 
     signal_file_path = dist.get_node_signal_file_name()
 
+    download_folder = dist_mkdtemp()
+
     # Non local rank 0 ranks will wait here for local rank 0 to finish the data processing.
     # Once local rank 0 is done, the datasets are all cached on disk, and all other ranks
     # can just read them.
@@ -913,7 +928,7 @@ def build_from_hf(
         if not os.path.isdir(dataset_name):
             # dataset_name is not a local dir path, download if needed.
             local_dataset_dir = os.path.join(
-                tempfile.mkdtemp(),
+                download_folder,
                 dataset_name,
             )
 
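The `_DEFAULT_CHAT_TEMPLATE` added here is a ChatML-style Jinja fallback, used only when the tokenizer ships no template of its own (`tokenizer.chat_template is None`). A minimal sketch of how it feeds into `apply_chat_template` (the tokenizer choice is illustrative; gpt2 has no built-in chat template):

```python
from transformers import AutoTokenizer

# ChatML-style fallback, as defined in the diff above.
_DEFAULT_CHAT_TEMPLATE = (
    '{% for message in messages %}'
    "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
    '{% endfor %}'
    '{% if add_generation_prompt %}'
    "{{ '<|im_start|>assistant\n' }}"
    '{% endif %}'
)

tokenizer = AutoTokenizer.from_pretrained('gpt2')

# Mirror the diff: fall back only when the tokenizer lacks a template.
chat_template = (
    None if tokenizer.chat_template is not None else _DEFAULT_CHAT_TEMPLATE
)

messages = [{'role': 'user', 'content': 'Hi!'}]
print(tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    chat_template=chat_template,
))
# <|im_start|>user
# Hi!<|im_end|>
# <|im_start|>assistant
```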
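The switch from `tempfile.mkdtemp()` to `dist_mkdtemp()` matters because each rank calling `mkdtemp` would get a different directory, so non-zero ranks could never find the dataset that local rank 0 downloaded. A plausible sketch of what such a helper does, assuming Composer's dist utilities (the actual `llmfoundry.utils.file_utils.dist_mkdtemp` may differ):

```python
import tempfile
from composer.utils import dist

def dist_mkdtemp() -> str:
    # Sketch: rank 0 creates the directory, then broadcasts the path so
    # every rank resolves the same location for the shared download.
    path = tempfile.mkdtemp() if dist.get_global_rank() == 0 else None
    paths = [path]
    dist.broadcast_object_list(paths, src=0)
    assert paths[0] is not None
    return paths[0]
```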
(Remaining changed files not shown.)