Merge branch 'main' into nancy/log-model
nancyhung committed Oct 22, 2024
2 parents 1915042 + 21c7ec8 commit bc29278
Showing 38 changed files with 412 additions and 246 deletions.
3 changes: 0 additions & 3 deletions .github/workflows/docker.yaml
@@ -20,11 +20,9 @@ jobs:
- name: "2.4.0_cu124"
base_image: mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04
dep_groups: "[all]"
te_commit: 901e5d2
- name: "2.4.0_cu124_aws"
base_image: mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws
dep_groups: "[all]"
te_commit: 901e5d2
steps:

- name: Checkout
@@ -91,4 +89,3 @@ jobs:
BRANCH_NAME=${{ github.head_ref || github.ref_name }}
BASE_IMAGE=${{ matrix.base_image }}
DEP_GROUPS=${{ matrix.dep_groups }}
TE_COMMIT=${{ matrix.te_commit }}
2 changes: 0 additions & 2 deletions .github/workflows/release.yaml
@@ -95,7 +95,6 @@ jobs:
build-args: |
BASE_IMAGE=mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws
BRANCH_NAME=${{ env.BRANCH_NAME }}
TE_COMMIT=901e5d2
DEP_GROUPS=[all]
KEEP_FOUNDRY=true
@@ -111,6 +110,5 @@
build-args: |
BASE_IMAGE=mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04
BRANCH_NAME=${{ env.BRANCH_NAME }}
TE_COMMIT=901e5d2
DEP_GROUPS=[all]
KEEP_FOUNDRY=true
4 changes: 0 additions & 4 deletions Dockerfile
@@ -6,7 +6,6 @@ FROM $BASE_IMAGE

ARG BRANCH_NAME
ARG DEP_GROUPS
ARG TE_COMMIT
ARG KEEP_FOUNDRY=false

ENV TORCH_CUDA_ARCH_LIST="8.0 8.6 8.7 8.9 9.0"
@@ -16,9 +15,6 @@ ENV TORCH_CUDA_ARCH_LIST="8.0 8.6 8.7 8.9 9.0"
ADD https://raw.githubusercontent.com/mosaicml/llm-foundry/$BRANCH_NAME/setup.py setup.py
RUN rm setup.py

# Install TransformerEngine
RUN NVTE_FRAMEWORK=pytorch CMAKE_BUILD_PARALLEL_LEVEL=4 MAX_JOBS=4 pip install git+https://github.com/NVIDIA/TransformerEngine.git@$TE_COMMIT

# Install and uninstall foundry to cache foundry requirements
RUN git clone -b $BRANCH_NAME https://github.com/mosaicml/llm-foundry.git
RUN pip install --no-cache-dir "./llm-foundry${DEP_GROUPS}"
2 changes: 1 addition & 1 deletion llmfoundry/_version.py
@@ -3,4 +3,4 @@

"""The LLM Foundry Version."""

__version__ = '0.13.0.dev0'
__version__ = '0.14.0.dev0'
4 changes: 3 additions & 1 deletion llmfoundry/callbacks/hf_checkpointer.py
@@ -234,6 +234,7 @@ def __init__(
mlflow_logging_config: Optional[dict] = None,
flatten_imports: Sequence[str] = ('llmfoundry',),
final_register_only: bool = False,
register_wait_seconds: int = 7200,
):
_, _, self.save_dir_format_str = parse_uri(save_folder)
self.overwrite = overwrite
@@ -247,6 +248,7 @@ def __init__(
self.using_peft = False

self.final_register_only = final_register_only
self.register_wait_seconds = register_wait_seconds

self.mlflow_registered_model_name = mlflow_registered_model_name
if self.final_register_only and self.mlflow_registered_model_name is None:
@@ -379,7 +381,7 @@ def run_event(self, event: Event, state: State, logger: Logger) -> None:
self.using_peft = composer_model.using_peft
elif event == Event.FIT_END:
# Wait for all child processes spawned by the callback to finish.
timeout = 3600
timeout = self.register_wait_seconds
wait_start = time.time()
while not self._all_register_processes_done(state.device):
wait_time = time.time() - wait_start
1 change: 1 addition & 0 deletions llmfoundry/command_utils/data_prep/convert_dataset_hf.py
@@ -451,6 +451,7 @@ def convert_dataset_hf_from_args(
ValueError: If the output directory already contains the requested splits
ValueError: If `concat_tokens` is set but `tokenizer` is not
"""
os.environ['WORLD_SIZE'] = '1'
if tokenizer_kwargs:
parsed_tokenizer_kwargs = json.loads(tokenizer_kwargs)
else:
1 change: 1 addition & 0 deletions llmfoundry/command_utils/data_prep/convert_dataset_json.py
@@ -186,6 +186,7 @@ def convert_dataset_json_from_args(
ValueError: If the out_root directory exists and contains files that overlap with the requested splits
ValueError: If concat_tokens is set and a tokenizer is not provided
"""
os.environ['WORLD_SIZE'] = '1'
if os.path.isdir(out_root) and len(
set(os.listdir(out_root)).intersection(set(split)),
) > 0:
4 changes: 4 additions & 0 deletions llmfoundry/command_utils/data_prep/convert_delta_to_json.py
@@ -550,6 +550,9 @@ def validate_and_get_cluster_info(
).upper()[len('DATASECURITYMODE.'):]

# NONE stands for No Isolation Shared
# This check actually checks for Unity Catalog governance compatibility and does not
# check for invalid cluster access for a particular user. Cluster access controls is
# difficult and there is no single existing API to check this.
if data_security_mode == 'NONE':
raise ClusterInvalidAccessMode(
cluster_id=cluster_id,
@@ -767,6 +770,7 @@ def convert_delta_to_json_from_args(
use_serverless (bool): Use serverless or not. Make sure the workspace is entitled with serverless
json_output_filename (str): The name of the combined final jsonl that combines all partitioned jsonl
"""
os.environ['WORLD_SIZE'] = '1'
_check_imports()
from databricks.sdk import WorkspaceClient
w = WorkspaceClient()
llmfoundry/command_utils/data_prep/convert_finetuning_dataset.py
@@ -309,6 +309,7 @@ def convert_finetuning_dataset_from_args(
ValueError: If the target settings are invalid.
ValueError: If the output directory already contains the requested splits.
"""
os.environ['WORLD_SIZE'] = '1'
if os.path.isdir(out_root) and len(
set(os.listdir(out_root)).intersection(set(splits)),
) > 0:
1 change: 1 addition & 0 deletions llmfoundry/command_utils/data_prep/convert_text_to_mds.py
@@ -557,6 +557,7 @@ def convert_text_to_mds_from_args(
Raises:
ValueError: If `use_tokenizer_eos` is True and `eos_text` is not None
"""
os.environ['WORLD_SIZE'] = '1'
if use_tokenizer_eos:
# Ensure that eos text is not specified twice.
if eos_text is not None:
25 changes: 4 additions & 21 deletions llmfoundry/command_utils/eval.py
@@ -4,7 +4,6 @@
import logging
import os
import time
import warnings
from typing import Any, Optional, Union

import pandas as pd
@@ -37,7 +36,6 @@
process_init_device,
)
from llmfoundry.utils.registry_utils import import_file
from llmfoundry.utils.warnings import VersionedDeprecationWarning

log = logging.getLogger(__name__)

@@ -63,7 +61,6 @@ def evaluate_model(
callback_configs: Optional[dict[str, Any]],
metadata: Optional[dict[str, str]],
logged_config: dict[str, Any],
fsdp_config: Optional[dict[str, Any]] = None,
parallelism_config: Optional[dict[str, Any]] = None,
should_log_config: bool = True,
load_path: Optional[str] = None,
@@ -78,18 +75,6 @@ def evaluate_model(
'parallelism_config cannot contain deprecated fsdp_config arguments.',
)

if fsdp_config:
warnings.warn(
VersionedDeprecationWarning(
'The argument fsdp_config is deprecated. Please use parallelism_config instead.',
remove_version='0.14.0',
),
)
if fsdp_config and parallelism_config:
raise ValueError(
'Both fsdp_config and parallelism_config cannot be provided at the same time. Please use parallelism_config.',
)

log.info(f'Evaluating model: {model_name}')
# Build tokenizer and model
tokenizer_cfg = tokenizer
@@ -125,9 +110,9 @@
mosaicml_logger._flush_metadata(force_flush=True)

fsdp_config = parallelism_config.get(
'fsdp_config',
'fsdp',
None,
) if parallelism_config else fsdp_config
) if parallelism_config else None
if fsdp_config and model.get('load_in_8bit', False):
raise ValueError(
'The FSDP config block is not supported when loading ' +
@@ -175,7 +160,7 @@
callbacks=callbacks,
loggers=loggers,
precision=precision,
parallelism_config={'fsdp': fsdp_config},
parallelism_config=parallelism_config,
load_path=load_path,
load_weights_only=True,
progress_bar=False,
@@ -268,8 +253,6 @@ def evaluate(cfg: DictConfig) -> tuple[list[Trainer], pd.DataFrame]:
model_configs = eval_config.models
eval_gauntlet_config = eval_config.eval_gauntlet or eval_config.eval_gauntlet_str

fsdp_config = eval_config.fsdp_config

# Mandatory Evaluation Parameters
icl_tasks = eval_config.icl_tasks or eval_config.icl_tasks_str
if icl_tasks is None:
@@ -345,9 +328,9 @@
device_eval_batch_size=eval_config.device_eval_batch_size,
eval_gauntlet_config=eval_gauntlet_config,
eval_loader_config=eval_loader_config,
fsdp_config=fsdp_config,
loggers=loggers,
python_log_level=eval_config.python_log_level,
parallelism_config={'fsdp': eval_config.fsdp_config},
precision=eval_config.precision,
eval_gauntlet_df=eval_gauntlet_df,
callback_configs=eval_config.callbacks,
12 changes: 9 additions & 3 deletions llmfoundry/command_utils/train.py
@@ -311,17 +311,21 @@ def train(cfg: DictConfig) -> Trainer:
eval_gauntlet_config = train_cfg.eval_gauntlet or train_cfg.eval_gauntlet_str

# Optional parameters will be set to default values if not specified.
default_run_name: str = os.environ.get('RUN_NAME', 'llm')
run_name: str = train_cfg.run_name if train_cfg.run_name else default_run_name
env_run_name: Optional[str] = os.environ.get('RUN_NAME', None)
run_name: str = (
train_cfg.run_name if train_cfg.run_name else env_run_name
) or 'llm'
is_state_dict_sharded: bool = (
fsdp_config.get('state_dict_type', 'full') == 'sharded'
) if fsdp_config else False
save_latest_filename: str = train_cfg.save_latest_filename if train_cfg.save_latest_filename else 'latest-sharded-rank{rank}' if is_state_dict_sharded else 'latest-rank{rank}.pt'
save_filename: str = train_cfg.save_filename if train_cfg.save_filename else 'ep{epoch}-ba{batch}-rank{rank}.pt'

# Enable autoresume from model checkpoints if possible
is_user_set_run_name: bool = train_cfg.run_name is not None or env_run_name is not None
autoresume_default: bool = False
if train_cfg.save_folder is not None \
if is_user_set_run_name and \
train_cfg.save_folder is not None \
and not train_cfg.save_overwrite \
and not train_cfg.save_weights_only:
autoresume_default = True
@@ -588,6 +592,8 @@ def train(cfg: DictConfig) -> Trainer:
profiler=profiler,
compile_config=compile_config,
spin_dataloaders=train_cfg.spin_dataloaders,
accumulate_train_batch_on_tokens=train_cfg.
accumulate_train_batch_on_tokens,
)

_sort_callbacks(trainer)
33 changes: 23 additions & 10 deletions llmfoundry/data/finetuning/dataloader.py
@@ -198,7 +198,8 @@ def build_finetuning_dataloader(
allowed_dataset_config_keys = set(
dataset_constructor_keys,
).union(_ALLOWED_DATASET_KEYS)
_validate_config(

extraneous_keys = _validate_config(
**dataset_cfg,
allowed_dataset_keys=allowed_dataset_config_keys,
)
@@ -253,13 +254,13 @@ def build_finetuning_dataloader(
streams_cfg,
) if streams_cfg is not None else None

# Take the constructor args from above, minus args that have been created separately
dataset_constructor_args = {
k: v
for k, v in dataset_cfg.items()
if k in dataset_constructor_keys and
k not in {'streams', 'packing_ratio'}
if k in set(dataset_constructor_keys).union(extraneous_keys) and
k not in {'streams', 'packing_ratio', 'replication'}
}

streaming_dataset = dataset_constructor.build_from_streaming(
tokenizer=tokenizer,
streams=streams,
@@ -366,7 +367,7 @@ def build_finetuning_dataloader(

def _validate_config(
max_seq_len: int,
decoder_only_format: bool = False,
decoder_only_format: Optional[bool] = None,
hf_name: Optional[str] = None,
local: Optional[str] = None,
remote: Optional[str] = None,
@@ -378,7 +379,7 @@ def _validate_config(
target_responses: Optional[str] = None,
allowed_dataset_keys: set[str] = _ALLOWED_DATASET_KEYS,
**kwargs: dict[str, Any],
) -> None:
) -> set[str]:
"""Validates the dataset configuration.
Makes sure that the dataset is properly configured for either
@@ -389,7 +390,7 @@
max_seq_len (int): The maximum length of sequences
in the batch. See :class:`Seq2SeqFinetuningCollator` docstring
for details.
decoder_only_format (bool): Whether to format the
decoder_only_format (bool, optional): Whether to format the
examples for a decoder-only model. See :class:`Seq2SeqFinetuningCollator`
docstring for details.
hf_name (str, optional): The name of the HuggingFace dataset
@@ -434,11 +435,21 @@
Raises:
ValueError: If the dataset configuration does not meet the requirements.
Returns:
set[str]: Return the extraneous keys.
"""
if not set(kwargs.keys()).issubset(allowed_dataset_keys):
if decoder_only_format is None:
raise ValueError(
f'decoder_only_format must be set to either True or False, but it was {decoder_only_format}.',
)

extraneous_keys = set()
if not set(kwargs.keys()).issubset(allowed_dataset_keys):
extraneous_keys = set(kwargs.keys()) - allowed_dataset_keys
log.warning(
'The dataset config contains the following extraneous keys: ' +\
', '.join(set(kwargs.keys()) - allowed_dataset_keys),
', '.join(extraneous_keys),
)

if hf_name is not None:
Expand All @@ -456,7 +467,7 @@ def _validate_config(
'Those keys are used when building from a streaming dataset, but ' +\
'setting `hf_name` instructs the dataset to build from a HuggingFace dataset.',
)
elif remote is not None:
elif remote is not None or local is not None:
# Using the streaming dataset codepath
illegal_keys = {
'hf_name': hf_name,
@@ -533,6 +544,8 @@ def _validate_config(
decoder_only_format,
)

return extraneous_keys


def _download_remote_hf_dataset(remote_path: str, split: str) -> str:
"""Downloads a dataset from a remote object store.