Skip to content

Commit

Permalink
Add convert_dataset_hf to CLI (#1348)
Browse files Browse the repository at this point in the history
* convert_dataset_hf

* precommit

* precommit

* argument

* option

* back to fully option

* typer is a pain

* comma sep

* checks

* test

* typo

* clean imports

* commit comments 1

* commit comments 2 (precommit hell)

* script args

* typer defaults

* precommit

* bruh

* precommit

* yapf

* cli

* update annotation

* update annotation

* merge

* merge + refactor

* typo

* typo

* move app

* typo

---------

Co-authored-by: v-chen_data <[email protected]>
  • Loading branch information
KuuCi and v-chen_data authored Jul 16, 2024
1 parent cabc1a7 commit e7bf8db
Show file tree
Hide file tree
Showing 9 changed files with 658 additions and 504 deletions.
47 changes: 29 additions & 18 deletions llmfoundry/cli/cli.py
Original file line number Diff line number Diff line change
@@ -1,40 +1,51 @@
# Copyright 2024 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0

from typing import Optional
from typing import Annotated, Optional

import typer
from typer import Argument, Typer

from llmfoundry.cli import registry_cli
from llmfoundry.command_utils import eval_from_yaml, train_from_yaml
from llmfoundry.cli import (
data_prep_cli,
registry_cli,
)
from llmfoundry.command_utils import (
eval_from_yaml,
train_from_yaml,
)

app = typer.Typer(pretty_exceptions_show_locals=False)
app = Typer(pretty_exceptions_show_locals=False)
app.add_typer(registry_cli.app, name='registry')
app.add_typer(data_prep_cli.app, name='data_prep')


@app.command(name='train')
def train(
yaml_path: str = typer.Argument(
...,
help='Path to the YAML configuration file',
), # type: ignore
args_list: Optional[list[str]] = typer.
Argument(None, help='Additional command line arguments'), # type: ignore
yaml_path: Annotated[str,
Argument(
...,
help='Path to the YAML configuration file',
)],
args_list: Annotated[
Optional[list[str]],
Argument(help='Additional command line arguments')] = None,
):
"""Run the training with optional overrides from CLI."""
train_from_yaml(yaml_path, args_list)


@app.command(name='eval')
def eval(
yaml_path: str = typer.Argument(
...,
help='Path to the YAML configuration file',
), # type: ignore
args_list: Optional[list[str]] = typer.
Argument(None, help='Additional command line arguments'), # type: ignore
yaml_path: Annotated[str,
Argument(
...,
help='Path to the YAML configuration file',
)],
args_list: Annotated[
Optional[list[str]],
Argument(help='Additional command line arguments')] = None,
):
"""Run the training with optional overrides from CLI."""
"""Run the eval with optional overrides from CLI."""
eval_from_yaml(yaml_path, args_list)


Expand Down
61 changes: 61 additions & 0 deletions llmfoundry/cli/data_prep_cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Copyright 2024 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0

from typing import Annotated, Optional

from typer import Option, Typer

from llmfoundry.command_utils import (
convert_dataset_hf_from_args,
)

app = Typer(pretty_exceptions_show_locals=False)


@app.command(name='convert_dataset_hf')
def convert_dataset_hf(
    dataset: Annotated[str, Option(..., help='Name of the dataset')],
    out_root: Annotated[str, Option(..., help='Output root directory')],
    data_subset: Annotated[
        Optional[str],
        Option(help='Subset of the dataset (e.g., "all" or "en")'),
    ] = None,
    splits: Annotated[
        str,
        Option(help='Comma-separated list of dataset splits'),
    ] = 'train, train_small, val, val_small, val_xsmall',
    compression: Annotated[
        Optional[str],
        Option(help='Compression type'),
    ] = None,
    concat_tokens: Annotated[
        Optional[int],
        Option(help='Concatenate tokens up to this many tokens'),
    ] = None,
    tokenizer: Annotated[
        Optional[str],
        Option(help='Tokenizer name'),
    ] = None,
    tokenizer_kwargs: Annotated[
        Optional[str],
        Option(help='Tokenizer keyword arguments in JSON format'),
    ] = None,
    bos_text: Annotated[Optional[str], Option(help='BOS text')] = None,
    eos_text: Annotated[Optional[str], Option(help='EOS text')] = None,
    no_wrap: Annotated[
        bool,
        Option(help='Do not wrap text across max_length boundaries'),
    ] = False,
    num_workers: Annotated[
        Optional[int],
        Option(help='Number of workers'),
    ] = None,
):
    """Converts dataset from HuggingFace into JSON files."""
    # NOTE: the docstring above is kept as a single line on purpose; Typer
    # presumably surfaces it as the command's --help text, so extra detail
    # lives in comments instead.
    #
    # This command is a thin CLI wrapper: it only normalizes ``splits`` and
    # forwards every other option unchanged to convert_dataset_hf_from_args.

    # Turn the comma-separated option into a clean list of split names.
    # Whitespace around each name is stripped because the default value (and
    # most hand-typed input) has a space after every comma; without stripping,
    # the callee would receive names such as ' train_small'. Empty entries
    # produced by stray or trailing commas are dropped.
    stripped_names = (piece.strip() for piece in splits.split(','))
    splits_list = [name for name in stripped_names if name]

    convert_dataset_hf_from_args(
        dataset=dataset,
        data_subset=data_subset,
        splits=splits_list,
        out_root=out_root,
        compression=compression,
        concat_tokens=concat_tokens,
        tokenizer=tokenizer,
        tokenizer_kwargs=tokenizer_kwargs,
        bos_text=bos_text,
        eos_text=eos_text,
        no_wrap=no_wrap,
        num_workers=num_workers,
    )
4 changes: 2 additions & 2 deletions llmfoundry/cli/registry_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@

from typing import Optional

import typer
from rich.console import Console
from rich.table import Table
from typer import Typer

from llmfoundry import registry
from llmfoundry.utils.registry_utils import TypedRegistry

console = Console()
app = typer.Typer(pretty_exceptions_show_locals=False)
app = Typer(pretty_exceptions_show_locals=False)


def _get_registries(group: Optional[str] = None) -> list[TypedRegistry]:
Expand Down
6 changes: 6 additions & 0 deletions llmfoundry/command_utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# Copyright 2024 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0
from llmfoundry.command_utils.data_prep.convert_dataset_hf import (
convert_dataset_hf,
convert_dataset_hf_from_args,
)
from llmfoundry.command_utils.eval import (
eval_from_yaml,
evaluate,
Expand All @@ -20,4 +24,6 @@
'validate_config',
'evaluate',
'eval_from_yaml',
'convert_dataset_hf',
'convert_dataset_hf_from_args',
]
Loading

0 comments on commit e7bf8db

Please sign in to comment.