Merge branch 'main' into eval_script_model

mosaicml · Jul 18, 2024 · commit 5f07eae
2 parents 9465603 + acb5530

Showing 39 changed files with 2,573 additions and 1,715 deletions.
diff --git a/llmfoundry/cli/cli.py b/llmfoundry/cli/cli.py
@@ -1,29 +1,53 @@
 # Copyright 2024 MosaicML LLM Foundry authors
 # SPDX-License-Identifier: Apache-2.0
 
-from typing import Optional
+from typing import Annotated, Optional
 
-import typer
+from typer import Argument, Typer
 
-from llmfoundry.cli import registry_cli
-from llmfoundry.train import train_from_yaml
+from llmfoundry.cli import (
+    data_prep_cli,
+    registry_cli,
+)
+from llmfoundry.command_utils import (
+    eval_from_yaml,
+    train_from_yaml,
+)
 
-app = typer.Typer(pretty_exceptions_show_locals=False)
+app = Typer(pretty_exceptions_show_locals=False)
 app.add_typer(registry_cli.app, name='registry')
+app.add_typer(data_prep_cli.app, name='data_prep')
 
 
 @app.command(name='train')
 def train(
-    yaml_path: str = typer.Argument(
-        ...,
-        help='Path to the YAML configuration file',
-    ),  # type: ignore
-    args_list: Optional[list[str]] = typer.
-    Argument(None, help='Additional command line arguments'),  # type: ignore
+    yaml_path: Annotated[str,
+                         Argument(
+                             ...,
+                             help='Path to the YAML configuration file',
+                         )],
+    args_list: Annotated[
+        Optional[list[str]],
+        Argument(help='Additional command line arguments')] = None,
 ):
     """Run the training with optional overrides from CLI."""
+    # Thin CLI entry point: the YAML path and any extra arguments are passed
+    # to train_from_yaml exactly as received.
     train_from_yaml(yaml_path, args_list)
 
 
+@app.command(name='eval')
+def eval(
+    yaml_path: Annotated[str,
+                         Argument(
+                             ...,
+                             help='Path to the YAML configuration file',
+                         )],
+    args_list: Annotated[
+        Optional[list[str]],
+        Argument(help='Additional command line arguments')] = None,
+):
+    """Run the eval with optional overrides from CLI."""
+    # NOTE(review): defining `eval` here shadows the builtin of the same name
+    # within this module; the CLI command name is set by `name='eval'` above.
+    eval_from_yaml(yaml_path, args_list)
+
+
 if __name__ == '__main__':
     app()
diff --git a/llmfoundry/cli/data_prep_cli.py b/llmfoundry/cli/data_prep_cli.py
@@ -0,0 +1,150 @@
+# Copyright 2024 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Annotated, Optional
+
+import psutil
+from typer import Option, Typer
+
+from llmfoundry.command_utils import (
+    convert_dataset_hf_from_args,
+    convert_dataset_json_from_args,
+    convert_text_to_mds_from_args,
+)
+
+app = Typer(pretty_exceptions_show_locals=False)
+
+
+@app.command(name='convert_dataset_hf')
+def convert_dataset_hf(
+    dataset: Annotated[str, Option(..., help='Name of the dataset')],
+    out_root: Annotated[str, Option(..., help='Output root directory')],
+    data_subset: Annotated[
+        Optional[str],
+        Option(help='Subset of the dataset (e.g., "all" or "en")'),
+    ] = None,
+    splits: Annotated[str,
+                      Option(help='Comma-separated list of dataset splits',),
+                     ] = 'train, train_small, val, val_small, val_xsmall',
+    compression: Annotated[Optional[str],
+                           Option(help='Compression type')] = None,
+    concat_tokens: Annotated[
+        Optional[int],
+        Option(help='Concatenate tokens up to this many tokens')] = None,
+    tokenizer: Annotated[Optional[str],
+                         Option(help='Tokenizer name')] = None,
+    tokenizer_kwargs: Annotated[
+        Optional[str],
+        Option(help='Tokenizer keyword arguments in JSON format')] = None,
+    bos_text: Annotated[Optional[str], Option(help='BOS text')] = None,
+    eos_text: Annotated[Optional[str], Option(help='EOS text')] = None,
+    no_wrap: Annotated[
+        bool,
+        Option(help='Do not wrap text across max_length boundaries'),
+    ] = False,
+    num_workers: Annotated[Optional[int],
+                           Option(help='Number of workers')] = None,
+):
+    """Convert a dataset from HuggingFace to MDS streaming format."""
+    # NOTE(review): "MDS" follows the sibling convert_* commands in this
+    # module; confirm against convert_dataset_hf_from_args.
+    #
+    # Split names are stripped and empty entries dropped: the default value
+    # separates names with ', ', so a bare split(',') would hand names such as
+    # ' train_small' (leading space) to the converter.
+    splits_list = [name.strip() for name in splits.split(',') if name.strip()]
+    convert_dataset_hf_from_args(
+        dataset=dataset,
+        data_subset=data_subset,
+        splits=splits_list,
+        out_root=out_root,
+        compression=compression,
+        concat_tokens=concat_tokens,
+        tokenizer=tokenizer,
+        tokenizer_kwargs=tokenizer_kwargs,
+        bos_text=bos_text,
+        eos_text=eos_text,
+        no_wrap=no_wrap,
+        num_workers=num_workers,
+    )
+
+
+@app.command(name='convert_dataset_json')
+def convert_dataset_json(
+    path: Annotated[str, Option(..., help='Path to the input data file')],
+    out_root: Annotated[str, Option(..., help='Output root directory')],
+    concat_tokens: Annotated[
+        int,
+        Option(
+            ...,
+            help='Convert text to tokens and concatenate up to this many tokens',
+        )],
+    tokenizer: Annotated[str, Option(..., help='Tokenizer name')],
+    compression: Annotated[Optional[str],
+                           Option(help='Compression type, if any')] = 'zstd',
+    split: Annotated[str, Option(help='Dataset split to process')] = 'train',
+    bos_text: Annotated[
+        Optional[str],
+        Option(help='Text to insert at the beginning of each sequence')] = None,
+    eos_text: Annotated[
+        Optional[str],
+        Option(help='Text to insert at the end of each sequence')] = None,
+    no_wrap: Annotated[
+        bool,
+        Option(help='Do not wrap text across max_length boundaries')] = False,
+    num_workers: Annotated[
+        Optional[int],
+        Option(help='Number of workers for data loading')] = None,
+):
+    """Convert a dataset from JSON to MDS streaming format."""
+    # Thin CLI wrapper: every option is forwarded by keyword, unmodified, to
+    # convert_dataset_json_from_args.
+    convert_dataset_json_from_args(
+        path=path,
+        split=split,
+        out_root=out_root,
+        compression=compression,
+        concat_tokens=concat_tokens,
+        tokenizer=tokenizer,
+        bos_text=bos_text,
+        eos_text=eos_text,
+        no_wrap=no_wrap,
+        num_workers=num_workers,
+    )
+
+
+@app.command(name='convert_text_to_mds')
+def convert_text_to_mds(
+    output_folder: Annotated[str, Option(..., help='The folder to write output to')],
+    input_folder: Annotated[str, Option(..., help='The folder with text files to convert to MDS')],
+    concat_tokens: Annotated[int, Option(..., help='Convert text to tokens and concatenate up to this many tokens')],
+    tokenizer: Annotated[str, Option(..., help='The name of the tokenizer to use')],
+    bos_text: Annotated[Optional[str], Option(help='The text to prepend to each example to separate concatenated examples')] = None,
+    eos_text: Annotated[Optional[str], Option(help='The text to append to each example to separate concatenated examples')] = None,
+    compression: Annotated[str, Option(help='The compression algorithm to use for MDS writing')] = 'zstd',
+    use_tokenizer_eos: Annotated[bool, Option(help='Use the EOS text from the tokenizer')] = False,
+    # The flag disables wrapping, so the help text is phrased as a negative,
+    # matching the other commands in this module.
+    no_wrap: Annotated[bool, Option(help='Do not let text examples wrap across multiple training examples')] = False,
+    # psutil.cpu_count() may return None when the count cannot be determined;
+    # fall back to 1 so that evaluating this default at import time never
+    # raises. The result is clamped to the range [1, 32].
+    processes: Annotated[int, Option(
+        help='The number of processes to use to download and convert the dataset',
+    )] = min(max((psutil.cpu_count() or 1) - 2, 1), 32),
+    reprocess: Annotated[bool, Option(
+        help=
+        'If true, reprocess the input_folder to MDS format. Otherwise, only reprocess upon changes to the input folder or dataset creation parameters.',
+    )] = False,
+    trust_remote_code: Annotated[bool, Option(
+        help='If true, allows custom code to be executed to load the tokenizer',
+    )] = False,
+    logging_level: Annotated[str, Option(
+        help='Logging level for the script. Default is INFO.',
+    )] = 'INFO',
+):
+    """Convert text files to MDS streaming format."""
+    # Every option is forwarded by keyword; note that the CLI option
+    # `tokenizer` maps to the `tokenizer_name` keyword of the helper.
+    convert_text_to_mds_from_args(
+        output_folder=output_folder,
+        input_folder=input_folder,
+        compression=compression,
+        concat_tokens=concat_tokens,
+        tokenizer_name=tokenizer,
+        bos_text=bos_text,
+        eos_text=eos_text,
+        use_tokenizer_eos=use_tokenizer_eos,
+        no_wrap=no_wrap,
+        processes=processes,
+        reprocess=reprocess,
+        trust_remote_code=trust_remote_code,
+        logging_level=logging_level,
+    )
diff --git a/llmfoundry/cli/registry_cli.py b/llmfoundry/cli/registry_cli.py
@@ -3,15 +3,15 @@
 
 from typing import Optional
 
-import typer
 from rich.console import Console
 from rich.table import Table
+from typer import Typer
 
 from llmfoundry import registry
 from llmfoundry.utils.registry_utils import TypedRegistry
 
 console = Console()
-app = typer.Typer(pretty_exceptions_show_locals=False)
+app = Typer(pretty_exceptions_show_locals=False)
 
 
 def _get_registries(group: Optional[str] = None) -> list[TypedRegistry]:

diff --git a/llmfoundry/command_utils/__init__.py b/llmfoundry/command_utils/__init__.py
@@ -0,0 +1,41 @@
+# Copyright 2024 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+from llmfoundry.command_utils.data_prep.convert_dataset_hf import (
+    convert_dataset_hf,
+    convert_dataset_hf_from_args,
+)
+from llmfoundry.command_utils.data_prep.convert_dataset_json import (
+    convert_dataset_json,
+    convert_dataset_json_from_args,
+)
+from llmfoundry.command_utils.data_prep.convert_text_to_mds import (
+    convert_text_to_mds,
+    convert_text_to_mds_from_args,
+)
+from llmfoundry.command_utils.eval import (
+    eval_from_yaml,
+    evaluate,
+)
+from llmfoundry.command_utils.train import (
+    TRAIN_CONFIG_KEYS,
+    TrainConfig,
+    train,
+    train_from_yaml,
+    validate_config,
+)
+
+# Names re-exported as the public API of this package, grouped by the
+# submodule they are imported from above.
+__all__ = [
+    # llmfoundry.command_utils.train
+    'train',
+    'train_from_yaml',
+    'TrainConfig',
+    'TRAIN_CONFIG_KEYS',
+    'validate_config',
+    # llmfoundry.command_utils.eval
+    'evaluate',
+    'eval_from_yaml',
+    # llmfoundry.command_utils.data_prep
+    'convert_dataset_hf',
+    'convert_dataset_hf_from_args',
+    'convert_dataset_json',
+    'convert_dataset_json_from_args',
+    'convert_text_to_mds',
+    'convert_text_to_mds_from_args',
+]
diff --git a/llmfoundry/command_utils/data_prep/__init__.py b/llmfoundry/command_utils/data_prep/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2024 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0