Merge branch 'main' into nancy/log-model
nancyhung committed Oct 22, 2024
2 parents 1915042 + 21c7ec8 commit bc29278
Showing 38 changed files with 412 additions and 246 deletions.
3 changes: 0 additions & 3 deletions .github/workflows/docker.yaml
@@ -20,11 +20,9 @@ jobs:
- name: "2.4.0_cu124"
base_image: mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04
dep_groups: "[all]"
te_commit: 901e5d2
- name: "2.4.0_cu124_aws"
base_image: mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws
dep_groups: "[all]"
te_commit: 901e5d2
steps:

- name: Checkout
@@ -91,4 +89,3 @@ jobs:
BRANCH_NAME=${{ github.head_ref || github.ref_name }}
BASE_IMAGE=${{ matrix.base_image }}
DEP_GROUPS=${{ matrix.dep_groups }}
TE_COMMIT=${{ matrix.te_commit }}
2 changes: 0 additions & 2 deletions .github/workflows/release.yaml
@@ -95,7 +95,6 @@ jobs:
build-args: |
BASE_IMAGE=mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws
BRANCH_NAME=${{ env.BRANCH_NAME }}
TE_COMMIT=901e5d2
DEP_GROUPS=[all]
KEEP_FOUNDRY=true
@@ -111,6 +110,5 @@
build-args: |
BASE_IMAGE=mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04
BRANCH_NAME=${{ env.BRANCH_NAME }}
TE_COMMIT=901e5d2
DEP_GROUPS=[all]
KEEP_FOUNDRY=true
4 changes: 0 additions & 4 deletions Dockerfile
@@ -6,7 +6,6 @@ FROM $BASE_IMAGE

ARG BRANCH_NAME
ARG DEP_GROUPS
ARG TE_COMMIT
ARG KEEP_FOUNDRY=false

ENV TORCH_CUDA_ARCH_LIST="8.0 8.6 8.7 8.9 9.0"
@@ -16,9 +15,6 @@ ENV TORCH_CUDA_ARCH_LIST="8.0 8.6 8.7 8.9 9.0"
ADD https://raw.githubusercontent.com/mosaicml/llm-foundry/$BRANCH_NAME/setup.py setup.py
RUN rm setup.py

# Install TransformerEngine
RUN NVTE_FRAMEWORK=pytorch CMAKE_BUILD_PARALLEL_LEVEL=4 MAX_JOBS=4 pip install git+https://github.com/NVIDIA/TransformerEngine.git@$TE_COMMIT

# Install and uninstall foundry to cache foundry requirements
RUN git clone -b $BRANCH_NAME https://github.com/mosaicml/llm-foundry.git
RUN pip install --no-cache-dir "./llm-foundry${DEP_GROUPS}"
2 changes: 1 addition & 1 deletion llmfoundry/_version.py
@@ -3,4 +3,4 @@

"""The LLM Foundry Version."""

__version__ = '0.13.0.dev0'
__version__ = '0.14.0.dev0'
4 changes: 3 additions & 1 deletion llmfoundry/callbacks/hf_checkpointer.py
@@ -234,6 +234,7 @@ def __init__(
mlflow_logging_config: Optional[dict] = None,
flatten_imports: Sequence[str] = ('llmfoundry',),
final_register_only: bool = False,
register_wait_seconds: int = 7200,
):
_, _, self.save_dir_format_str = parse_uri(save_folder)
self.overwrite = overwrite
@@ -247,6 +248,7 @@ def __init__(
self.using_peft = False

self.final_register_only = final_register_only
self.register_wait_seconds = register_wait_seconds

self.mlflow_registered_model_name = mlflow_registered_model_name
if self.final_register_only and self.mlflow_registered_model_name is None:
@@ -379,7 +381,7 @@ def run_event(self, event: Event, state: State, logger: Logger) -> None:
self.using_peft = composer_model.using_peft
elif event == Event.FIT_END:
# Wait for all child processes spawned by the callback to finish.
timeout = 3600
timeout = self.register_wait_seconds
wait_start = time.time()
while not self._all_register_processes_done(state.device):
wait_time = time.time() - wait_start
1 change: 1 addition & 0 deletions llmfoundry/command_utils/data_prep/convert_dataset_hf.py
@@ -451,6 +451,7 @@ def convert_dataset_hf_from_args(
ValueError: If the output directory already contains the requested splits
ValueError: If `concat_tokens` is set but `tokenizer` is not
"""
os.environ['WORLD_SIZE'] = '1'
if tokenizer_kwargs:
parsed_tokenizer_kwargs = json.loads(tokenizer_kwargs)
else:
1 change: 1 addition & 0 deletions llmfoundry/command_utils/data_prep/convert_dataset_json.py
@@ -186,6 +186,7 @@ def convert_dataset_json_from_args(
ValueError: If the out_root directory exists and contains files that overlap with the requested splits
ValueError: If concat_tokens is set and a tokenizer is not provided
"""
os.environ['WORLD_SIZE'] = '1'
if os.path.isdir(out_root) and len(
set(os.listdir(out_root)).intersection(set(split)),
) > 0:
4 changes: 4 additions & 0 deletions llmfoundry/command_utils/data_prep/convert_delta_to_json.py
@@ -550,6 +550,9 @@ def validate_and_get_cluster_info(
).upper()[len('DATASECURITYMODE.'):]

# NONE stands for No Isolation Shared
# This check actually checks for Unity Catalog governance compatibility and does not
# check for invalid cluster access for a particular user. Cluster access controls is
# difficult and there is no single existing API to check this.
if data_security_mode == 'NONE':
raise ClusterInvalidAccessMode(
cluster_id=cluster_id,
@@ -767,6 +770,7 @@ def convert_delta_to_json_from_args(
use_serverless (bool): Use serverless or not. Make sure the workspace is entitled with serverless
json_output_filename (str): The name of the combined final jsonl that combines all partitioned jsonl
"""
os.environ['WORLD_SIZE'] = '1'
_check_imports()
from databricks.sdk import WorkspaceClient
w = WorkspaceClient()
llmfoundry/command_utils/data_prep/convert_finetuning_dataset.py
@@ -309,6 +309,7 @@ def convert_finetuning_dataset_from_args(
ValueError: If the target settings are invalid.
ValueError: If the output directory already contains the requested splits.
"""
os.environ['WORLD_SIZE'] = '1'
if os.path.isdir(out_root) and len(
set(os.listdir(out_root)).intersection(set(splits)),
) > 0:
1 change: 1 addition & 0 deletions llmfoundry/command_utils/data_prep/convert_text_to_mds.py
@@ -557,6 +557,7 @@ def convert_text_to_mds_from_args(
Raises:
ValueError: If `use_tokenizer_eos` is True and `eos_text` is not None
"""
os.environ['WORLD_SIZE'] = '1'
if use_tokenizer_eos:
# Ensure that eos text is not specified twice.
if eos_text is not None:
25 changes: 4 additions & 21 deletions llmfoundry/command_utils/eval.py
@@ -4,7 +4,6 @@
import logging
import os
import time
import warnings
from typing import Any, Optional, Union

import pandas as pd
@@ -37,7 +36,6 @@
process_init_device,
)
from llmfoundry.utils.registry_utils import import_file
from llmfoundry.utils.warnings import VersionedDeprecationWarning

log = logging.getLogger(__name__)

@@ -63,7 +61,6 @@ def evaluate_model(
callback_configs: Optional[dict[str, Any]],
metadata: Optional[dict[str, str]],
logged_config: dict[str, Any],
fsdp_config: Optional[dict[str, Any]] = None,
parallelism_config: Optional[dict[str, Any]] = None,
should_log_config: bool = True,
load_path: Optional[str] = None,
@@ -78,18 +75,6 @@ def evaluate_model(
'parallelism_config cannot contain deprecated fsdp_config arguments.',
)

if fsdp_config:
warnings.warn(
VersionedDeprecationWarning(
'The argument fsdp_config is deprecated. Please use parallelism_config instead.',
remove_version='0.14.0',
),
)
if fsdp_config and parallelism_config:
raise ValueError(
'Both fsdp_config and parallelism_config cannot be provided at the same time. Please use parallelism_config.',
)

log.info(f'Evaluating model: {model_name}')
# Build tokenizer and model
tokenizer_cfg = tokenizer
@@ -125,9 +110,9 @@
mosaicml_logger._flush_metadata(force_flush=True)

fsdp_config = parallelism_config.get(
'fsdp_config',
'fsdp',
None,
) if parallelism_config else fsdp_config
) if parallelism_config else None
if fsdp_config and model.get('load_in_8bit', False):
raise ValueError(
'The FSDP config block is not supported when loading ' +
@@ -175,7 +160,7 @@
callbacks=callbacks,
loggers=loggers,
precision=precision,
parallelism_config={'fsdp': fsdp_config},
parallelism_config=parallelism_config,
load_path=load_path,
load_weights_only=True,
progress_bar=False,
@@ -268,8 +253,6 @@ def evaluate(cfg: DictConfig) -> tuple[list[Trainer], pd.DataFrame]:
model_configs = eval_config.models
eval_gauntlet_config = eval_config.eval_gauntlet or eval_config.eval_gauntlet_str

fsdp_config = eval_config.fsdp_config

# Mandatory Evaluation Parameters
icl_tasks = eval_config.icl_tasks or eval_config.icl_tasks_str
if icl_tasks is None:
@@ -345,9 +328,9 @@
device_eval_batch_size=eval_config.device_eval_batch_size,
eval_gauntlet_config=eval_gauntlet_config,
eval_loader_config=eval_loader_config,
fsdp_config=fsdp_config,
loggers=loggers,
python_log_level=eval_config.python_log_level,
parallelism_config={'fsdp': eval_config.fsdp_config},
precision=eval_config.precision,
eval_gauntlet_df=eval_gauntlet_df,
callback_configs=eval_config.callbacks,
12 changes: 9 additions & 3 deletions llmfoundry/command_utils/train.py
@@ -311,17 +311,21 @@ def train(cfg: DictConfig) -> Trainer:
eval_gauntlet_config = train_cfg.eval_gauntlet or train_cfg.eval_gauntlet_str

# Optional parameters will be set to default values if not specified.
default_run_name: str = os.environ.get('RUN_NAME', 'llm')
run_name: str = train_cfg.run_name if train_cfg.run_name else default_run_name
env_run_name: Optional[str] = os.environ.get('RUN_NAME', None)
run_name: str = (
train_cfg.run_name if train_cfg.run_name else env_run_name
) or 'llm'
is_state_dict_sharded: bool = (
fsdp_config.get('state_dict_type', 'full') == 'sharded'
) if fsdp_config else False
save_latest_filename: str = train_cfg.save_latest_filename if train_cfg.save_latest_filename else 'latest-sharded-rank{rank}' if is_state_dict_sharded else 'latest-rank{rank}.pt'
save_filename: str = train_cfg.save_filename if train_cfg.save_filename else 'ep{epoch}-ba{batch}-rank{rank}.pt'

# Enable autoresume from model checkpoints if possible
is_user_set_run_name: bool = train_cfg.run_name is not None or env_run_name is not None
autoresume_default: bool = False
if train_cfg.save_folder is not None \
if is_user_set_run_name and \
train_cfg.save_folder is not None \
and not train_cfg.save_overwrite \
and not train_cfg.save_weights_only:
autoresume_default = True
@@ -588,6 +592,8 @@ def train(cfg: DictConfig) -> Trainer:
profiler=profiler,
compile_config=compile_config,
spin_dataloaders=train_cfg.spin_dataloaders,
accumulate_train_batch_on_tokens=train_cfg.
accumulate_train_batch_on_tokens,
)

_sort_callbacks(trainer)
33 changes: 23 additions & 10 deletions llmfoundry/data/finetuning/dataloader.py
@@ -198,7 +198,8 @@ def build_finetuning_dataloader(
allowed_dataset_config_keys = set(
dataset_constructor_keys,
).union(_ALLOWED_DATASET_KEYS)
_validate_config(

extraneous_keys = _validate_config(
**dataset_cfg,
allowed_dataset_keys=allowed_dataset_config_keys,
)
@@ -253,13 +254,13 @@ def build_finetuning_dataloader(
streams_cfg,
) if streams_cfg is not None else None

# Take the constructor args from above, minus args that have been created separately
dataset_constructor_args = {
k: v
for k, v in dataset_cfg.items()
if k in dataset_constructor_keys and
k not in {'streams', 'packing_ratio'}
if k in set(dataset_constructor_keys).union(extraneous_keys) and
k not in {'streams', 'packing_ratio', 'replication'}
}

streaming_dataset = dataset_constructor.build_from_streaming(
tokenizer=tokenizer,
streams=streams,
@@ -366,7 +367,7 @@ def build_finetuning_dataloader(

def _validate_config(
max_seq_len: int,
decoder_only_format: bool = False,
decoder_only_format: Optional[bool] = None,
hf_name: Optional[str] = None,
local: Optional[str] = None,
remote: Optional[str] = None,
@@ -378,7 +379,7 @@ def _validate_config(
target_responses: Optional[str] = None,
allowed_dataset_keys: set[str] = _ALLOWED_DATASET_KEYS,
**kwargs: dict[str, Any],
) -> None:
) -> set[str]:
"""Validates the dataset configuration.
Makes sure that the dataset is properly configured for either
@@ -389,7 +390,7 @@
max_seq_len (int): The maximum length of sequences
in the batch. See :class:`Seq2SeqFinetuningCollator` docstring
for details.
decoder_only_format (bool): Whether to format the
decoder_only_format (bool, optional): Whether to format the
examples for a decoder-only model. See :class:`Seq2SeqFinetuningCollator`
docstring for details.
hf_name (str, optional): The name of the HuggingFace dataset
@@ -434,11 +435,21 @@
Raises:
ValueError: If the dataset configuration does not meet the requirements.
Returns:
set[str]: Return the extraneous keys.
"""
if not set(kwargs.keys()).issubset(allowed_dataset_keys):
if decoder_only_format is None:
raise ValueError(
f'decoder_only_format must be set to either True or False, but it was {decoder_only_format}.',
)

extraneous_keys = set()
if not set(kwargs.keys()).issubset(allowed_dataset_keys):
extraneous_keys = set(kwargs.keys()) - allowed_dataset_keys
log.warning(
'The dataset config contains the following extraneous keys: ' +\
', '.join(set(kwargs.keys()) - allowed_dataset_keys),
', '.join(extraneous_keys),
)

if hf_name is not None:
Expand All @@ -456,7 +467,7 @@ def _validate_config(
'Those keys are used when building from a streaming dataset, but ' +\
'setting `hf_name` instructs the dataset to build from a HuggingFace dataset.',
)
elif remote is not None:
elif remote is not None or local is not None:
# Using the streaming dataset codepath
illegal_keys = {
'hf_name': hf_name,
@@ -533,6 +544,8 @@ def _validate_config(
decoder_only_format,
)

return extraneous_keys


def _download_remote_hf_dataset(remote_path: str, split: str) -> str:
"""Downloads a dataset from a remote object store.