Commit 74abd92: Merge branch 'main' into seed-sampler

mvpatel2000 authored Nov 29, 2024
2 parents 68c0d6c + 41e02d7
Showing 39 changed files with 592 additions and 251 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/docker.yaml
@@ -17,11 +17,11 @@ jobs:
     strategy:
       matrix:
         include:
-          - name: "2.4.0_cu124"
-            base_image: mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04
+          - name: "2.5.1_cu124"
+            base_image: mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu22.04
             dep_groups: "[all]"
-          - name: "2.4.0_cu124_aws"
-            base_image: mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws
+          - name: "2.5.1_cu124_aws"
+            base_image: mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu22.04-aws
             dep_groups: "[all]"
     steps:
4 changes: 2 additions & 2 deletions .github/workflows/pr-cpu.yaml
@@ -21,9 +21,9 @@ jobs:
     strategy:
       matrix:
         include:
-          - name: "cpu-2.4.0"
+          - name: "cpu-2.5.1"
             pip_deps: "[all-cpu]"
-            container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04
+            container: mosaicml/pytorch:2.5.1_cpu-python3.11-ubuntu22.04
             markers: "not gpu"
             pytest_command: "coverage run -m pytest"
     steps:
12 changes: 6 additions & 6 deletions .github/workflows/pr-gpu.yaml
@@ -22,8 +22,8 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - name: "gpu-2.4.0-1"
-            container: mosaicml/llm-foundry:2.4.0_cu124-latest
+          - name: "gpu-2.5.1-1"
+            container: mosaicml/llm-foundry:2.5.1_cu124-latest
             markers: "gpu"
             pip_deps: "[all]"
             pytest_command: "coverage run -m pytest"
@@ -51,8 +51,8 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - name: "gpu-2.4.0-2"
-            container: mosaicml/llm-foundry:2.4.0_cu124-latest
+          - name: "gpu-2.5.1-2"
+            container: mosaicml/llm-foundry:2.5.1_cu124-latest
             markers: "gpu"
             pip_deps: "[all]"
             pytest_command: "coverage run -m pytest"
@@ -80,8 +80,8 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - name: "gpu-2.4.0-4"
-            container: mosaicml/llm-foundry:2.4.0_cu124-latest
+          - name: "gpu-2.5.1-4"
+            container: mosaicml/llm-foundry:2.5.1_cu124-latest
             markers: "gpu"
             pip_deps: "[all]"
             pytest_command: "coverage run -m pytest"
4 changes: 2 additions & 2 deletions .github/workflows/release.yaml
@@ -93,7 +93,7 @@ jobs:
             ${{ env.AWS_DOCKER_TAG }}
             ${{ env.AWS_LATEST_TAG }}
           build-args: |
-            BASE_IMAGE=mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws
+            BASE_IMAGE=mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu22.04-aws
             BRANCH_NAME=${{ env.BRANCH_NAME }}
             DEP_GROUPS=[all]
             KEEP_FOUNDRY=true
@@ -108,7 +108,7 @@ jobs:
             ${{ env.DOCKER_TAG }}
             ${{ env.LATEST_TAG }}
           build-args: |
-            BASE_IMAGE=mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04
+            BASE_IMAGE=mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu22.04
             BRANCH_NAME=${{ env.BRANCH_NAME }}
             DEP_GROUPS=[all]
             KEEP_FOUNDRY=true
12 changes: 6 additions & 6 deletions README.md
@@ -113,24 +113,24 @@ If you have success/failure using LLM Foundry on other systems, please let us know!
 
 | Device         | Torch Version | Cuda Version | Status                       |
 | -------------- | ------------- | ------------ | ---------------------------- |
-| A100-40GB/80GB | 2.4.0         | 12.4         | :white_check_mark: Supported |
-| H100-80GB      | 2.4.0         | 12.4         | :white_check_mark: Supported |
+| A100-40GB/80GB | 2.5.1         | 12.4         | :white_check_mark: Supported |
+| H100-80GB      | 2.5.1         | 12.4         | :white_check_mark: Supported |
 
 ## MosaicML Docker Images
 We highly recommend using our prebuilt Docker images. You can find them here: https://hub.docker.com/orgs/mosaicml/repositories.
 
 The `mosaicml/pytorch` images are pinned to specific PyTorch and CUDA versions, and are stable and rarely updated.
 
 The `mosaicml/llm-foundry` images are built with new tags upon every commit to the `main` branch.
-You can select a specific commit hash such as `mosaicml/llm-foundry:2.4.0_cu124-36ab1ba` or take the latest one using `mosaicml/llm-foundry:2.4.0_cu124-latest`.
+You can select a specific commit hash such as `mosaicml/llm-foundry:2.5.1_cu124-9867a7b` or take the latest one using `mosaicml/llm-foundry:2.5.1_cu124-latest`.
 
 **Please Note:** The `mosaicml/llm-foundry` images do not come with the `llm-foundry` package preinstalled, just the dependencies. You will still need to `pip install llm-foundry` either from PyPI or from source.
 
 | Docker Image                                          | Torch Version | Cuda Version      | LLM Foundry dependencies installed? |
 | ----------------------------------------------------- | ------------- | ----------------- | ----------------------------------- |
-| `mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04` | 2.4.0         | 12.4 (Infiniband) | No                                  |
-| `mosaicml/llm-foundry:2.4.0_cu124-latest`             | 2.4.0         | 12.4 (Infiniband) | Yes                                 |
-| `mosaicml/llm-foundry:2.4.0_cu124_aws-latest`         | 2.4.0         | 12.4 (EFA)        | Yes                                 |
+| `mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu22.04` | 2.5.1         | 12.4 (Infiniband) | No                                  |
+| `mosaicml/llm-foundry:2.5.1_cu124-latest`             | 2.5.1         | 12.4 (Infiniband) | Yes                                 |
+| `mosaicml/llm-foundry:2.5.1_cu124_aws-latest`         | 2.5.1         | 12.4 (EFA)        | Yes                                 |
 
 
 # Installation
2 changes: 1 addition & 1 deletion llmfoundry/_version.py
@@ -3,4 +3,4 @@
 
 """The LLM Foundry Version."""
 
-__version__ = '0.15.0.dev0'
+__version__ = '0.16.0.dev0'
19 changes: 11 additions & 8 deletions llmfoundry/callbacks/hf_checkpointer.py
@@ -743,7 +743,10 @@ def tensor_hook(
         ) if is_te_imported and state.precision == Precision.AMP_FP8 else contextlib.nullcontext(
         )
         with context_manager:
-            new_model_instance.save_pretrained(temp_save_dir)
+            new_model_instance.save_pretrained(
+                temp_save_dir,
+                max_shard_size='1GB',
+            )
         if original_tokenizer is not None:
             assert isinstance(
                 original_tokenizer,
@@ -781,6 +784,10 @@ def tensor_hook(
 
         if dist.get_global_rank() == 0:
             if register_to_mlflow:
+                assert new_model_instance is not None
+                new_model_instance = self.transform_model_pre_registration(
+                    new_model_instance,
+                )
                 if self.using_peft:
 
                     # Save and register peft model to mlflow, this code path uses our older two step logic
@@ -795,11 +802,10 @@ def tensor_hook(
                         temp_save_dir,
                         'register_save',
                     )
-                    assert new_model_instance is not None
-                    new_model_instance = self.transform_model_pre_registration(
-                        new_model_instance,
+                    new_model_instance.save_pretrained(
+                        register_save_dir,
+                        max_shard_size='1GB',
                     )
-                    new_model_instance.save_pretrained(register_save_dir)
                     if original_tokenizer:
                         original_tokenizer.save_pretrained(register_save_dir)
 
@@ -854,9 +860,6 @@ def _save_and_register_peft_model(
         original_tokenizer: Optional[Any],
         save_dir: str,
     ):
-        new_model_instance = self.transform_model_pre_registration(
-            new_model_instance,
-        )
        components = {'model': new_model_instance}
         if original_tokenizer is not None:
             components['tokenizer'] = original_tokenizer
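A note on the `max_shard_size='1GB'` argument threaded through both `save_pretrained` calls: Hugging Face caps each checkpoint shard at the given size instead of the 5GB default. A minimal sketch of the effect outside the callback (model choice and output path are illustrative):

```python
from transformers import AutoModelForCausalLM

# Illustrative stand-in; the callback calls this on the trained model.
model = AutoModelForCausalLM.from_pretrained('gpt2')

# Shards default to at most 5GB each; capping them at 1GB yields more,
# smaller files (model-00001-of-000NN.safetensors plus an index file),
# which keeps individual uploads small when registering to MLflow.
# (gpt2 is small enough to stay in a single file; larger models shard.)
model.save_pretrained('/tmp/hf_ckpt', max_shard_size='1GB')
```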
60 changes: 48 additions & 12 deletions llmfoundry/command_utils/data_prep/convert_delta_to_json.py
@@ -1,6 +1,7 @@
 # Copyright 2022 MosaicML LLM Foundry authors
 # SPDX-License-Identifier: Apache-2.0
 
+import json
 import logging
 import os
 import re
@@ -26,6 +27,8 @@
     FailedToCreateSQLConnectionError,
     FaultyDataPrepCluster,
     InsufficientPermissionsError,
+    MalformedUCTableError,
+    StoragePermissionError,
     UCNotEnabledError,
 )
 
@@ -500,19 +503,18 @@ def fetch(
     from pyspark.errors import AnalysisException
 
     if isinstance(e, (AnalysisException, ServerOperationError)):
-        if 'INSUFFICIENT_PERMISSIONS' in str(e):
-            raise InsufficientPermissionsError(str(e)) from e
-        elif 'UC_NOT_ENABLED' in str(e):
+        error_message = str(e)
+        if 'INSUFFICIENT_PERMISSIONS' in error_message:
+            raise InsufficientPermissionsError(error_message) from e
+        elif 'UC_NOT_ENABLED' in error_message:
             raise UCNotEnabledError() from e
-        elif 'DELTA_TABLE_NOT_FOUND' in str(e):
-            err_str = str(e)
-            # Error string should be in this format:
-            # ---
+        elif 'UNRESOLVED_COLUMN.WITH_SUGGESTION' in error_message:
+            raise MalformedUCTableError(error_message) from e
+        elif 'Delta table' in str(e) and "doesn't exist" in str(e):
             # Error processing `catalog`.`volume_name`.`table_name`:
-            # [DELTA_TABLE_NOT_FOUND] Delta table `volume_name`.`table_name`
-            # doesn't exist.
-            # ---
-            parts = err_str.split('`')
+            # Delta table `volume_name`.`table_name` doesn't exist.
+            parts = error_message.split('`')
             if len(parts) < 7:
                 # Failed to parse error, our codebase is brittle
                 # with respect to the string representations of
@@ -681,7 +683,7 @@ def fetch_DT(
 
     log.info(f'Directory {json_output_folder} created.')
 
-    # validate_and_get_cluster_info allows cluster_id to be None if use_serverless is True
+    # Validate_and_get_cluster_info allows cluster_id to be None if use_serverless is True.
     method, dbsql, sparkSession = validate_and_get_cluster_info(
         cluster_id=cluster_id,
         databricks_host=DATABRICKS_HOST,
@@ -704,6 +706,14 @@ def fetch_DT(
             dbsql,
         )
     except (grpc.RpcError, spark_errors.SparkConnectGrpcException) as e:
+        if isinstance(
+            e,
+            spark_errors.SparkConnectGrpcException,
+        ) and 'is not Shared or Single User Cluster' in str(e):
+            raise FaultyDataPrepCluster(
+                message=
+                f'The cluster you have provided: {cluster_id} does not have data governance enabled. Please use a cluster with a data security mode other than NONE. {e}',
+            ) from e
         if isinstance(
             e,
             spark_errors.SparkConnectGrpcException,
@@ -732,12 +742,38 @@ def fetch_DT(
     if dbsql is not None:
         dbsql.close()
 
-    # combine downloaded jsonl into one big jsonl for IFT
+    # Combine downloaded jsonl into one big jsonl for IFT.
     iterative_combine_jsons(
         json_output_folder,
         os.path.join(json_output_folder, json_output_filename),
     )
 
+    _validate_written_file(
+        json_output_folder,
+        json_output_filename,
+        delta_table_name,
+    )
+
+
+def _validate_written_file(
+    json_output_folder: str,
+    json_output_filename: str,
+    delta_table_name: str,
+):
+    # Validate downloaded dataset is actually downloaded.
+    with open(os.path.join(json_output_folder, json_output_filename)) as f:
+        is_empty = True
+        for line in f.readlines():
+            is_empty = False
+            try:
+                json.loads(line)
+            except Exception as e:
+                raise ValueError(f'Line is not valid json: {line}') from e
+    if is_empty:
+        raise StoragePermissionError(
+            f'Unable to download {delta_table_name}, check network permissions.',
+        )
+
+
 def _check_imports():
     try:
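For readers following the backtick parsing above: the table coordinates land in the odd-indexed fields of the split. A standalone sketch under the error-string shape documented in the comment (the helper name is hypothetical, not part of this module):

```python
def parse_missing_table(error_message: str) -> str:
    # Expected shape, per the comment in fetch():
    #   Error processing `catalog`.`schema`.`table`:
    #   Delta table `schema`.`table` doesn't exist.
    parts = error_message.split('`')
    if len(parts) < 7:
        raise ValueError('unrecognized error format')
    # parts = ['Error processing ', 'catalog', '.', 'schema', '.', 'table', ': ...']
    catalog, schema, table = parts[1], parts[3], parts[5]
    return f'{catalog}.{schema}.{table}'

msg = (
    'Error processing `main`.`my_schema`.`my_table`: '
    "Delta table `my_schema`.`my_table` doesn't exist."
)
print(parse_missing_table(msg))  # main.my_schema.my_table
```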
2 changes: 2 additions & 0 deletions llmfoundry/data/contrastive_pairs/dataloader.py
@@ -13,6 +13,7 @@
 import numpy as np
 import torch
 from composer.core import DataSpec
+from composer.utils import retry
 from streaming import Stream, StreamingDataset
 from torch.utils.data import DataLoader
 from transformers import PreTrainedTokenizerBase
@@ -136,6 +137,7 @@ def _get_contrastive_samples(
             'negative': negative_responses,
         }
 
+    @retry(BlockingIOError, num_attempts=5, initial_backoff=1.0, max_jitter=0.5)
     def __getitem__(self, idx: int) -> dict[str, list[int]]:
         sample = StreamingDataset.__getitem__(self, idx)
         text_samples = []
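The new `retry` decorator (from `composer.utils`) re-invokes `__getitem__` when the streaming backend raises a transient `BlockingIOError`, backing off between attempts. A minimal sketch of the same pattern on a standalone function (the flaky reader is contrived for illustration):

```python
from composer.utils import retry

attempts = 0

# Retries up to 5 times on BlockingIOError, sleeping roughly
# initial_backoff * 2**attempt seconds (plus up to max_jitter) between
# tries; the exception propagates if every attempt fails.
@retry(BlockingIOError, num_attempts=5, initial_backoff=1.0, max_jitter=0.5)
def flaky_read() -> str:
    global attempts
    attempts += 1
    if attempts < 3:  # simulate two transient I/O failures
        raise BlockingIOError('resource temporarily unavailable')
    return 'sample'

print(flaky_read(), f'(succeeded after {attempts} attempts)')
```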
19 changes: 17 additions & 2 deletions llmfoundry/data/finetuning/tasks.py
@@ -34,7 +34,6 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:
 import importlib
 import logging
 import os
-import tempfile
 import warnings
 from collections.abc import Mapping
 from functools import partial
@@ -93,6 +92,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:
     UnknownExampleTypeError,
 )
 # yapf: enable
+from llmfoundry.utils.file_utils import dist_mkdtemp
 from llmfoundry.utils.logging_utils import SpecificWarningFilter
 
 log = logging.getLogger(__name__)
@@ -119,6 +119,15 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:
 ExampleType = Literal['prompt_response', 'chat']
 TokenizedExample = dict[str, list[dict[str, list[int]]]]
 
+_DEFAULT_CHAT_TEMPLATE = (
+    '{% for message in messages %}'
+    "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
+    '{% endfor %}'
+    '{% if add_generation_prompt %}'
+    "{{ '<|im_start|>assistant\n' }}"
+    '{% endif %}'
+)
+
 
 def _get_example_type(example: Example) -> ExampleType:
     """Determines the type of the input example.
@@ -243,17 +252,21 @@ def slice_out_last_turn(
     messages_through_current_turn: list[dict[str, str]],
     conversation_through_previous_turn: str,
 ) -> tuple[str, str]:
+    chat_template = None if tokenizer.chat_template is not None else _DEFAULT_CHAT_TEMPLATE
+
     try:
         full_conversation = tokenizer.apply_chat_template(
             messages_through_current_turn,
             tokenize=False,
             date_string=get_date_string(),
+            chat_template=chat_template,
         )
         prompt_with_history = tokenizer.apply_chat_template(
             messages_through_current_turn[:-1],
             tokenize=False,
             add_generation_prompt=True,
             date_string=get_date_string(),
+            chat_template=chat_template,
         )
     except Exception as e:
         raise ChatTemplateError(
@@ -888,6 +901,8 @@ def build_from_hf(
 
     signal_file_path = dist.get_node_signal_file_name()
 
+    download_folder = dist_mkdtemp()
+
     # Non local rank 0 ranks will wait here for local rank 0 to finish the data processing.
     # Once local rank 0 is done, the datasets are all cached on disk, and all other ranks
     # can just read them.
@@ -913,7 +928,7 @@ def build_from_hf(
         if not os.path.isdir(dataset_name):
             # dataset_name is not a local dir path, download if needed.
             local_dataset_dir = os.path.join(
-                tempfile.mkdtemp(),
+                download_folder,
                 dataset_name,
             )
 
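The `_DEFAULT_CHAT_TEMPLATE` added here is a ChatML-style Jinja fallback, used only when the tokenizer ships no template of its own (`tokenizer.chat_template is None`). A minimal sketch of how it feeds into `apply_chat_template` (the tokenizer choice is illustrative; gpt2 has no built-in chat template):

```python
from transformers import AutoTokenizer

# ChatML-style fallback, as defined in the diff above.
_DEFAULT_CHAT_TEMPLATE = (
    '{% for message in messages %}'
    "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
    '{% endfor %}'
    '{% if add_generation_prompt %}'
    "{{ '<|im_start|>assistant\n' }}"
    '{% endif %}'
)

tokenizer = AutoTokenizer.from_pretrained('gpt2')

# Mirror the diff: fall back only when the tokenizer lacks a template.
chat_template = (
    None if tokenizer.chat_template is not None else _DEFAULT_CHAT_TEMPLATE
)

messages = [{'role': 'user', 'content': 'Hi!'}]
print(tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    chat_template=chat_template,
))
# <|im_start|>user
# Hi!<|im_end|>
# <|im_start|>assistant
```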
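The switch from `tempfile.mkdtemp()` to `dist_mkdtemp()` matters because each rank calling `mkdtemp` would get a different directory, so non-zero ranks could never find the dataset that local rank 0 downloaded. A plausible sketch of what such a helper does, assuming Composer's dist utilities (the actual `llmfoundry.utils.file_utils.dist_mkdtemp` may differ):

```python
import tempfile
from composer.utils import dist

def dist_mkdtemp() -> str:
    # Sketch: rank 0 creates the directory, then broadcasts the path so
    # every rank resolves the same location for the shared download.
    path = tempfile.mkdtemp() if dist.get_global_rank() == 0 else None
    paths = [path]
    dist.broadcast_object_list(paths, src=0)
    assert paths[0] is not None
    return paths[0]
```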
(Remaining changed files not shown.)