Add convert_dataset_hf to CLI (#1348)

* convert_dataset_hf
* precommit
* precommit
* argument
* option
* back to fully option
* typer is a pain
* comma sep
* checks
* test
* typo
* clean imports
* commit comments 1
* commit comments 2 (precommit hell)
* script args
* typer defaults
* precommit
* bruh
* precommit
* yapf
* cli
* update annotation
* update annotation
* merge
* merge + refactor
* typo
* typo
* move app
* typo

---------

Co-authored-by: v-chen_data <[email protected]>
mosaicml · Jul 16, 2024 · e7bf8db · e7bf8db
1 parent cabc1a7
commit e7bf8db
Show file tree

Hide file tree

Showing 9 changed files with 658 additions and 504 deletions.
diff --git a/llmfoundry/cli/cli.py b/llmfoundry/cli/cli.py
@@ -1,40 +1,51 @@
 # Copyright 2024 MosaicML LLM Foundry authors
 # SPDX-License-Identifier: Apache-2.0
 
-from typing import Optional
+from typing import Annotated, Optional
 
-import typer
+from typer import Argument, Typer
 
-from llmfoundry.cli import registry_cli
-from llmfoundry.command_utils import eval_from_yaml, train_from_yaml
+from llmfoundry.cli import (
+    data_prep_cli,
+    registry_cli,
+)
+from llmfoundry.command_utils import (
+    eval_from_yaml,
+    train_from_yaml,
+)
 
-app = typer.Typer(pretty_exceptions_show_locals=False)
+app = Typer(pretty_exceptions_show_locals=False)
 app.add_typer(registry_cli.app, name='registry')
+app.add_typer(data_prep_cli.app, name='data_prep')
 
 
 @app.command(name='train')
 def train(
-    yaml_path: str = typer.Argument(
-        ...,
-        help='Path to the YAML configuration file',
-    ),  # type: ignore
-    args_list: Optional[list[str]] = typer.
-    Argument(None, help='Additional command line arguments'),  # type: ignore
+    yaml_path: Annotated[str,
+                         Argument(
+                             ...,
+                             help='Path to the YAML configuration file',
+                         )],
+    args_list: Annotated[
+        Optional[list[str]],
+        Argument(help='Additional command line arguments')] = None,
 ):
     """Run the training with optional overrides from CLI."""
     train_from_yaml(yaml_path, args_list)
 
 
 @app.command(name='eval')
 def eval(
-    yaml_path: str = typer.Argument(
-        ...,
-        help='Path to the YAML configuration file',
-    ),  # type: ignore
-    args_list: Optional[list[str]] = typer.
-    Argument(None, help='Additional command line arguments'),  # type: ignore
+    yaml_path: Annotated[str,
+                         Argument(
+                             ...,
+                             help='Path to the YAML configuration file',
+                         )],
+    args_list: Annotated[
+        Optional[list[str]],
+        Argument(help='Additional command line arguments')] = None,
 ):
-    """Run the training with optional overrides from CLI."""
+    """Run the eval with optional overrides from CLI."""
     eval_from_yaml(yaml_path, args_list)
 
 

diff --git a/llmfoundry/cli/data_prep_cli.py b/llmfoundry/cli/data_prep_cli.py
@@ -0,0 +1,61 @@
+# Copyright 2024 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Annotated, Optional
+
+from typer import Option, Typer
+
+from llmfoundry.command_utils import (
+    convert_dataset_hf_from_args,
+)
+
+app = Typer(pretty_exceptions_show_locals=False)
+
+
+@app.command(name='convert_dataset_hf')
+def convert_dataset_hf(
+    dataset: Annotated[str, Option(..., help='Name of the dataset')],
+    out_root: Annotated[str, Option(..., help='Output root directory')],
+    data_subset: Annotated[
+        Optional[str],
+        Option(help='Subset of the dataset (e.g., "all" or "en")'),
+    ] = None,
+    splits: Annotated[str,
+                      Option(help='Comma-separated list of dataset splits',),
+                     ] = 'train, train_small, val, val_small, val_xsmall',
+    compression: Annotated[Optional[str],
+                           Option(help='Compression type')] = None,
+    concat_tokens: Annotated[
+        Optional[int],
+        Option(help='Concatenate tokens up to this many tokens')] = None,
+    tokenizer: Annotated[Optional[str],
+                         Option(help='Tokenizer name')] = None,
+    tokenizer_kwargs: Annotated[
+        Optional[str],
+        Option(help='Tokenizer keyword arguments in JSON format')] = None,
+    bos_text: Annotated[Optional[str], Option(help='BOS text')] = None,
+    eos_text: Annotated[Optional[str], Option(help='EOS text')] = None,
+    no_wrap: Annotated[
+        bool,
+        Option(help='Do not wrap text across max_length boundaries'),
+    ] = False,
+    num_workers: Annotated[Optional[int],
+                           Option(help='Number of workers')] = None,
+):
+    """Converts dataset from HuggingFace into JSON files."""
+    # Strip each name and drop empties: the default has a space after commas.
+    splits_list = [s.strip() for s in splits.split(',') if s.strip()]
+    convert_dataset_hf_from_args(
+        dataset=dataset,
+        data_subset=data_subset,
+        splits=splits_list,
+        out_root=out_root,
+        compression=compression,
+        concat_tokens=concat_tokens,
+        tokenizer=tokenizer,
+        tokenizer_kwargs=tokenizer_kwargs,
+        bos_text=bos_text,
+        eos_text=eos_text,
+        no_wrap=no_wrap,
+        num_workers=num_workers,
+    )
diff --git a/llmfoundry/cli/registry_cli.py b/llmfoundry/cli/registry_cli.py
@@ -3,15 +3,15 @@
 
 from typing import Optional
 
-import typer
 from rich.console import Console
 from rich.table import Table
+from typer import Typer
 
 from llmfoundry import registry
 from llmfoundry.utils.registry_utils import TypedRegistry
 
 console = Console()
-app = typer.Typer(pretty_exceptions_show_locals=False)
+app = Typer(pretty_exceptions_show_locals=False)
 
 
 def _get_registries(group: Optional[str] = None) -> list[TypedRegistry]:

diff --git a/llmfoundry/command_utils/__init__.py b/llmfoundry/command_utils/__init__.py
@@ -1,5 +1,9 @@
 # Copyright 2024 MosaicML LLM Foundry authors
 # SPDX-License-Identifier: Apache-2.0
+from llmfoundry.command_utils.data_prep.convert_dataset_hf import (
+    convert_dataset_hf,
+    convert_dataset_hf_from_args,
+)
 from llmfoundry.command_utils.eval import (
     eval_from_yaml,
     evaluate,
@@ -20,4 +24,6 @@
     'validate_config',
     'evaluate',
     'eval_from_yaml',
+    'convert_dataset_hf',
+    'convert_dataset_hf_from_args',
 ]