Skip to content

Commit

Permalink
Merge branch 'main' into eval_script_model
Browse files Browse the repository at this point in the history
  • Loading branch information
josejg committed Jul 18, 2024
2 parents 9465603 + acb5530 commit 5f07eae
Show file tree
Hide file tree
Showing 39 changed files with 2,573 additions and 1,715 deletions.
46 changes: 35 additions & 11 deletions llmfoundry/cli/cli.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,53 @@
# Copyright 2024 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0

from typing import Optional
from typing import Annotated, Optional

import typer
from typer import Argument, Typer

from llmfoundry.cli import registry_cli
from llmfoundry.train import train_from_yaml
from llmfoundry.cli import (
data_prep_cli,
registry_cli,
)
from llmfoundry.command_utils import (
eval_from_yaml,
train_from_yaml,
)

app = typer.Typer(pretty_exceptions_show_locals=False)
app = Typer(pretty_exceptions_show_locals=False)
app.add_typer(registry_cli.app, name='registry')
app.add_typer(data_prep_cli.app, name='data_prep')


@app.command(name='train')
def train(
yaml_path: str = typer.Argument(
...,
help='Path to the YAML configuration file',
), # type: ignore
args_list: Optional[list[str]] = typer.
Argument(None, help='Additional command line arguments'), # type: ignore
yaml_path: Annotated[str,
Argument(
...,
help='Path to the YAML configuration file',
)],
args_list: Annotated[
Optional[list[str]],
Argument(help='Additional command line arguments')] = None,
):
"""Run the training with optional overrides from CLI."""
train_from_yaml(yaml_path, args_list)


@app.command(name='eval')
def eval(
yaml_path: Annotated[str,
Argument(
...,
help='Path to the YAML configuration file',
)],
args_list: Annotated[
Optional[list[str]],
Argument(help='Additional command line arguments')] = None,
):
"""Run the eval with optional overrides from CLI."""
eval_from_yaml(yaml_path, args_list)


if __name__ == '__main__':
app()
150 changes: 150 additions & 0 deletions llmfoundry/cli/data_prep_cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
# Copyright 2024 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0

from typing import Annotated, Optional

import psutil
from typer import Option, Typer

from llmfoundry.command_utils import (
convert_dataset_hf_from_args,
convert_dataset_json_from_args,
convert_text_to_mds_from_args,
)

app = Typer(pretty_exceptions_show_locals=False)


@app.command(name='convert_dataset_hf')
def convert_dataset_hf(
dataset: Annotated[str, Option(..., help='Name of the dataset')],
out_root: Annotated[str, Option(..., help='Output root directory')],
data_subset: Annotated[
Optional[str],
Option(help='Subset of the dataset (e.g., "all" or "en")'),
] = None,
splits: Annotated[str,
Option(help='Comma-separated list of dataset splits',),
] = 'train, train_small, val, val_small, val_xsmall',
compression: Annotated[Optional[str],
Option(help='Compression type')] = None,
concat_tokens: Annotated[
Optional[int],
Option(help='Concatenate tokens up to this many tokens')] = None,
tokenizer: Annotated[Optional[str],
Option(help='Tokenizer name')] = None,
tokenizer_kwargs: Annotated[
Optional[str],
Option(help='Tokenizer keyword arguments in JSON format')] = None,
bos_text: Annotated[Optional[str], Option(help='BOS text')] = None,
eos_text: Annotated[Optional[str], Option(help='EOS text')] = None,
no_wrap: Annotated[
bool,
Option(help='Do not wrap text across max_length boundaries'),
] = False,
num_workers: Annotated[Optional[int],
Option(help='Number of workers')] = None,
):
"""Converts dataset from HuggingFace into JSON files."""
# Convert comma-separated splits into a list
splits_list = splits.split(',') if splits else []
convert_dataset_hf_from_args(
dataset=dataset,
data_subset=data_subset,
splits=splits_list,
out_root=out_root,
compression=compression,
concat_tokens=concat_tokens,
tokenizer=tokenizer,
tokenizer_kwargs=tokenizer_kwargs,
bos_text=bos_text,
eos_text=eos_text,
no_wrap=no_wrap,
num_workers=num_workers,
)


@app.command(name='convert_dataset_json')
def convert_dataset_json(
path: Annotated[str, Option(..., help='Path to the input data file')],
out_root: Annotated[str, Option(..., help='Output root directory')],
concat_tokens: Annotated[
int,
Option(
...,
help='Convert text to tokens and concatenate up to this many tokens',
)],
tokenizer: Annotated[str, Option(..., help='Tokenizer name')],
compression: Annotated[Optional[str],
Option(help='Compression type, if any')] = 'zstd',
split: Annotated[str, Option(help='Dataset split to process')] = 'train',
bos_text: Annotated[
Optional[str],
Option(help='Text to insert at the beginning of each sequence')] = None,
eos_text: Annotated[
Optional[str],
Option(help='Text to insert at the end of each sequence')] = None,
no_wrap: Annotated[
bool,
Option(help='Do not wrap text across max_length boundaries')] = False,
num_workers: Annotated[
Optional[int],
Option(help='Number of workers for data loading')] = None,
):
"""Convert a dataset from JSON to MDS streaming format."""
convert_dataset_json_from_args(
path=path,
split=split,
out_root=out_root,
compression=compression,
concat_tokens=concat_tokens,
tokenizer=tokenizer,
bos_text=bos_text,
eos_text=eos_text,
no_wrap=no_wrap,
num_workers=num_workers,
)


@app.command(name='convert_text_to_mds')
def convert_text_to_mds(
output_folder: Annotated[str, Option(..., help='The folder to write output to')],
input_folder: Annotated[str, Option(..., help='The folder with text files to convert to MDS')],
concat_tokens: Annotated[int, Option(..., help='Convert text to tokens and concatenate up to this many tokens')],
tokenizer: Annotated[str, Option(..., help='The name of the tokenizer to use')],
bos_text: Annotated[Optional[str], Option(help='The text to prepend to each example to separate concatenated examples')] = None,
eos_text: Annotated[Optional[str], Option(help='The text to append to each example to separate concatenated examples')] = None,
compression: Annotated[str, Option(help='The compression algorithm to use for MDS writing')] = 'zstd',
use_tokenizer_eos: Annotated[bool, Option(help='Use the EOS text from the tokenizer')] = False,
no_wrap: Annotated[bool, Option(help='Whether to let text examples wrap across multiple training examples')] = False,
processes: Annotated[int, Option(
help='The number of processes to use to download and convert the dataset',
)] = min(max(psutil.cpu_count() - 2, 1), 32), # type: ignore
reprocess: Annotated[bool, Option(
help=
'If true, reprocess the input_folder to MDS format. Otherwise, only reprocess upon changes to the input folder or dataset creation parameters.',
)] = False,
trust_remote_code: Annotated[bool, Option(
help='If true, allows custom code to be executed to load the tokenizer',
)] = False,
logging_level: Annotated[str, Option(
help='Logging level for the script. Default is INFO.',
)] = 'INFO',

):
"""Convert text files to MDS streaming format."""
convert_text_to_mds_from_args(
output_folder=output_folder,
input_folder=input_folder,
compression=compression,
concat_tokens=concat_tokens,
tokenizer_name=tokenizer,
bos_text=bos_text,
eos_text=eos_text,
use_tokenizer_eos=use_tokenizer_eos,
no_wrap=no_wrap,
processes=processes,
reprocess=reprocess,
trust_remote_code=trust_remote_code,
logging_level=logging_level,
)
4 changes: 2 additions & 2 deletions llmfoundry/cli/registry_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@

from typing import Optional

import typer
from rich.console import Console
from rich.table import Table
from typer import Typer

from llmfoundry import registry
from llmfoundry.utils.registry_utils import TypedRegistry

console = Console()
app = typer.Typer(pretty_exceptions_show_locals=False)
app = Typer(pretty_exceptions_show_locals=False)


def _get_registries(group: Optional[str] = None) -> list[TypedRegistry]:
Expand Down
41 changes: 41 additions & 0 deletions llmfoundry/command_utils/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Copyright 2024 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0
from llmfoundry.command_utils.data_prep.convert_dataset_hf import (
convert_dataset_hf,
convert_dataset_hf_from_args,
)
from llmfoundry.command_utils.data_prep.convert_dataset_json import (
convert_dataset_json,
convert_dataset_json_from_args,
)
from llmfoundry.command_utils.data_prep.convert_text_to_mds import (
convert_text_to_mds,
convert_text_to_mds_from_args,
)
from llmfoundry.command_utils.eval import (
eval_from_yaml,
evaluate,
)
from llmfoundry.command_utils.train import (
TRAIN_CONFIG_KEYS,
TrainConfig,
train,
train_from_yaml,
validate_config,
)

__all__ = [
'train',
'train_from_yaml',
'TrainConfig',
'TRAIN_CONFIG_KEYS',
'validate_config',
'evaluate',
'eval_from_yaml',
'convert_dataset_hf',
'convert_dataset_hf_from_args',
'convert_dataset_json',
'convert_dataset_json_from_args',
'convert_text_to_mds',
'convert_text_to_mds_from_args',
]
2 changes: 2 additions & 0 deletions llmfoundry/command_utils/data_prep/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Copyright 2024 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0
Loading

0 comments on commit 5f07eae

Please sign in to comment.