fix missing argument descriptions
Eitan Turok committed Aug 1, 2024
1 parent caae74d commit 95869eb
Showing 19 changed files with 156 additions and 45 deletions.
2 changes: 1 addition & 1 deletion llmfoundry/command_utils/data_prep/convert_dataset_json.py
@@ -34,7 +34,7 @@ def build_hf_dataset(
"""Build an IterableDataset over the HF C4 or pile source data.
Args:
dataset_name (str): Dataset name
path (str): Dataset name
split (str): Split name.
mode (ConcatMode): NO_CONCAT, or CONCAT_TOKENS
max_length (int): The length of concatenated tokens
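For reference, a hedged usage sketch of the build_hf_dataset call documented above; the import locations and the exact concatenation handling are assumptions, not confirmed by this diff.

    # Import paths are assumptions based on the file path shown in this commit.
    from llmfoundry.command_utils.data_prep.convert_dataset_json import (
        ConcatMode,
        build_hf_dataset,
    )

    dataset = build_hf_dataset(
        path='data/train.jsonl',      # hypothetical {split}.jsonl source
        split='train',
        mode=ConcatMode.NO_CONCAT,    # or ConcatMode.CONCAT_TOKENS, which concatenates tokens
        max_length=2048,              # only meaningful when concatenating tokens
    )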
26 changes: 15 additions & 11 deletions llmfoundry/command_utils/data_prep/convert_delta_to_json.py
@@ -78,15 +78,16 @@ def to_cf(self: 'SparkConnectClient',
return the schema and drops all other responses.
Args:
plan (pb2.Plan): The plan object to be executed by spark.
type (str): The output format of the result, supported formats are 'json', 'csv', and 'arrow'.
self (SparkConnectClient): The SparkConnectClient we are processing.
plan (pb2.Plan): The plan object to be executed by spark.
type (str): The output format of the result, supported formats are 'json', 'csv', and 'arrow'.
Returns:
Tuple[List[Result], int, bool]: A tuple containing:
- A list of Result namedtuples, each containing a URL, row count, compressed size,
and uncompressed size of the part of the result.
- Total row count of all parts of the result.
- A boolean indicating whether the result has been truncated.
"""
req = self._execute_plan_request_with_metadata()
req.plan.CopyFrom(plan)
@@ -120,8 +121,9 @@ def to_cf(self: 'SparkConnectClient',
)

# Create the iterator
from pyspark.sql.connect.client.reattach import \
ExecutePlanResponseReattachableIterator
from pyspark.sql.connect.client.reattach import (
ExecutePlanResponseReattachableIterator,
)
iterator = ExecutePlanResponseReattachableIterator(
req,
self._stub,
@@ -169,6 +171,7 @@ def collect_as_cf(self: 'DataFrame',
uses the `to_cf` method to execute the plan and fetch results as presigned URLs.
Args:
self (pd.DataFrame): The dataframe we are processing.
type (str): The output format of the result, supported formats are 'json', 'csv', and 'arrow'.
Returns:
@@ -693,8 +696,9 @@ def _check_imports():
import pyspark.sql.connect.proto.cloud_pb2 as cloud_pb2
from pyspark.sql import SparkSession
from pyspark.sql.connect.client.core import SparkConnectClient
from pyspark.sql.connect.client.reattach import \
ExecutePlanResponseReattachableIterator
from pyspark.sql.connect.client.reattach import (
ExecutePlanResponseReattachableIterator,
)
from pyspark.sql.connect.dataframe import DataFrame
from pyspark.sql.dataframe import DataFrame as SparkDataFrame
from pyspark.sql.types import Row
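For reference, a hedged sketch of the cloud-fetch contract described in the docstrings above. The patched collect_as_cf method and the tuple layout come from those docstrings; the attribute names on the Result namedtuple (url, row_count, compressed_size, uncompressed_size) are assumptions.

    # `spark_df` is assumed to be a Spark Connect DataFrame with the patched method.
    results, total_rows, is_truncated = spark_df.collect_as_cf(type='json')
    for part in results:
        # Each Result is documented to carry a URL plus row count and sizes for one part.
        print(part.url, part.row_count, part.compressed_size, part.uncompressed_size)
    if is_truncated:
        print(f'Result was truncated after {total_rows} rows')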
86 changes: 74 additions & 12 deletions llmfoundry/data/finetuning/dataloader.py
@@ -64,9 +64,12 @@ def build_finetuning_dataloader(
on which you intend to use, as explained below.
Args:
name (str): The type of dataloader to build. Must = "finetuning".
---
*** HuggingFace dataset config fields ***
tokenizer (transformers.PreTrainedTokenizer): The tokenizer used to
prepare the data from raw text. Any missing sentinel tokens will
be added by the collator.
device_batch_size (int, float): The size of the batches (number of examples)
that the dataloader will produce.
dataset (Dict[str, Any]): A HuggingFace dataset config which contains the following fields:
dataset.hf_name (str, optional): The name of the HuggingFace dataset
to use. Can also be a remote http(s) directory or object store bucket
containing the file {split}.jsonl in the format (prompt, response),
Expand Down Expand Up @@ -130,16 +133,32 @@ def build_finetuning_dataloader(
The script `scripts/misc/profile_packing.py` can help
you choose the best packing_ratio.
dataset.shuffle (bool): Whether to shuffle the dataset.
___
See :class:`StreamingFinetuningDataset` for info on other standard config
options within `dataset` that will be passed as kwargs if
using the streaming codepath.
---
tokenizer (transformers.PreTrainedTokenizer): The tokenizer used to
prepare the data from raw text. Any missing sentinel tokens will
be added by the collator.
device_batch_size (int, float): The size of the batches (number of examples)
that the dataloader will produce.
num_workers (int, optional): How many subprocesses to use for data loading.
0 means that the data will be loaded in the main process. The default is 0.
This argument is passed directly to the pytorch :class:`DataLoader`.
drop_last (bool, optional): If true, drop the last incomplete batch, if the dataset
size is not divisible by the batch size. If False and the size of dataset is
not divisible by the batch size, then the last batch will be smaller. The
default is False. This argument is passed directly to the pytorch :class:`DataLoader`.
pin_memory (bool, optional): If True, the data loader will copy Tensors into device/CUDA
pinned memory before returning them. If your data elements are a custom type, or your
`collate_fn` returns a batch that is a custom type. This argument is passed directly to
the pytorch :class:`DataLoader`.
prefetch_factor (int, optional): Number of batches loaded in advance by each worker.
2 means there will be a total of 2 * num_workers batches prefetched across all workers.
(default value depends on the set value for num_workers. If value of num_workers=0 default
is None. Otherwise, if value of num_workers > 0 default is 2). This argument is passed
directly to the pytorch :class:`DataLoader`.
persistent_workers (bool, optional): If True, the data loader will not shut down the worker
processes after a dataset has been consumed once. This allows to maintain the workers
Dataset instances alive. The default is False. This argument is passed directly to the
pytorch :class:`DataLoader`.
timeout (int, optional): If positive, the timeout value for collecting a batch from workers.
Should always be non-negative. The default is 0. This argument is passed directly to the
pytorch :class:`DataLoader`.
See :class:`DataLoader` for standard argument options to the pytorch
dataloader, such as `drop_last`, `num_workers`, etc.
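For reference, a minimal call sketch wired up from the arguments documented above; the dataset name and all sizes are placeholders, and the streaming fields (dataset.remote / dataset.local) are omitted.

    dataloader = build_finetuning_dataloader(
        tokenizer=tokenizer,      # a transformers.PreTrainedTokenizer, assumed in scope
        device_batch_size=8,
        dataset={
            'hf_name': 'my-org/my-sft-data',  # hypothetical HF dataset or {split}.jsonl location
            'split': 'train',                 # split field assumed; not shown in this hunk
            'max_seq_len': 2048,
            'shuffle': True,
        },
        num_workers=8,
        drop_last=True,
        pin_memory=True,
        prefetch_factor=2,
        persistent_workers=True,
        timeout=0,
    )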
@@ -357,7 +376,50 @@ def _validate_config(
the other.
Args:
dataset_cfg (DictConfig): The dataset configuration to be validated.
max_seq_len (int): The maximum length of sequences
in the batch. See :class:`Seq2SeqFinetuningCollator` docstring
for details.
decoder_only_format (bool): Whether to format the
examples for a decoder-only model. See :class:`Seq2SeqFinetuningCollator`
docstring for details.
hf_name (str, optional): The name of the HuggingFace dataset
to use. Can also be a remote http(s) directory or object store bucket
containing the file {split}.jsonl in the format (prompt, response),
in which case the builder will create a HuggingFace dataset.
local (str, optional): Local path where remote data
will be streamed to. Only valid if `cfg.dataset.remote` has
also been set.
remote (str, optional): Location of a MDS-formatted
streaming dataset to use. Setting this will tell the builder
to create a streaming dataset rather than a HuggingFace dataset.
hf_kwargs (DictConfig, optional): Additional kwargs to
pass to `datasets.load_dataset`, which can be used to load
a dataset from local files.
preprocessing_fn (str, optional): The name/import path of
the preprocessing function to use for formatting the data examples.
If ``None`` (default), the builder will use the preprocessing function
registered under `hf_name` (see `tasks.py`), if one exists,
otherwise it will skip preprocessing.
If `preprocessing_fn` corresponds to a registered preprocessing
function in `tasks.py`, the builder will use that.
Otherwise, it will interpret `preprocessing_fn` as a
"import.path:function_name" import path; e.g., it will call
`from import.path import function_name` and use the imported
function as the preprocessing function.
safe_load (bool, optional): Whether to enforce safe loading of the dataset.
If `None`, will default to not applying any safe loading.
streams (Dict[str, Any], optional): A dictionary with multiple data streams.
If `None`, will assume no streams.
target_prompts (str): Which prompts are used as training targets.
Defaults to "none", meaning prompts are never used as training targets.
See :class:`Seq2SeqFinetuningCollator` docstring for details.
target_responses (str): Which responses are used as training targets.
Defaults to "last", meaning only the final response in multi-turn examples
will serve as training targets. See :class:`Seq2SeqFinetuningCollator` docstring for
details.
kwargs (DictConfig, optional): Additional kwargs to
pass to `datasets.load_dataset`, which can be used to load
a dataset from local files.
Raises:
ValueError: If the dataset configuration does not meet the requirements.
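For reference, two hedged sketches of the mutually exclusive dataset configurations this validator distinguishes (HuggingFace-backed vs. streaming); every value is hypothetical.

    # HuggingFace-backed finetuning data
    hf_backed = {
        'hf_name': 'my-org/my-sft-data',
        'max_seq_len': 2048,
        'decoder_only_format': True,
    }

    # Streaming MDS-formatted finetuning data; `local` is only valid alongside `remote`
    streaming_backed = {
        'remote': 's3://my-bucket/mds-finetuning-data',
        'local': '/tmp/mds-cache',
        'max_seq_len': 2048,
        'decoder_only_format': True,
    }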
@@ -504,7 +566,7 @@ def _download_remote_hf_dataset(remote_path: str, split: str) -> str:
completed, the function removes the signal file.
Args:
hf_name (str): The path of the HuggingFace dataset to download.
remote_path (str): The path of the HuggingFace dataset to download.
split (str): The dataset split to download (e.g., 'train', 'validation', 'test').
Returns:
34 changes: 30 additions & 4 deletions llmfoundry/data/finetuning/tasks.py
@@ -68,6 +68,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:
stitch_turns_decoder_only,
stitch_turns_encoder_decoder,
)

# yapf: disable
from llmfoundry.utils.exceptions import (
ALLOWED_MESSAGES_KEYS,
@@ -88,6 +89,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:
UnableToProcessPromptResponseError,
UnknownExampleTypeError,
)

# yapf: enable
from llmfoundry.utils.logging_utils import SpecificWarningFilter

@@ -162,7 +164,7 @@ def _is_empty_or_nonexistent(dirpath: str) -> bool:
Args:
dirpath (str): Directory path to check.
Returns
Returns:
True if directory is empty or non-existent. False otherwise.
"""
return not os.path.isdir(dirpath) or len(os.listdir(dirpath)) == 0
@@ -820,9 +822,33 @@ def build_from_hf(
Note: This function will drop examples where the prompt is longer than the max_seq_len
Args:
cfg (DictConfig): The dataset configuration.
max_seq_len (int): The maximum sequence length. Examples with prompts longer than this will be dropped.
tokenizer (Tokenizer): The tokenizer to be used for tokenizing the dataset.
dataset_name (str): The name of the HuggingFace dataset
to use. Can also be a remote http(s) directory or object store bucket
containing the file {split}.jsonl in the format (prompt, response),
in which case the builder will create a HuggingFace dataset.
split (str): The split of the HuggingFace dataset.
safe_load (bool, optional): Whether to enforce safe loading of the dataset.
If `None`, will default to not applying any safe loading.
max_seq_len (int): The maximum length of sequences
in the batch. See :class:`Seq2SeqFinetuningCollator` docstring
for details.
preprocessing_fn (Callable, optional): The preprocessing function to use for
formatting the data examples.
tokenizer (PreTrainedTokenizerBase): The tokenizer to be used for tokenizing
the HuggingFace dataset.
target_prompts (str): Which prompts are used as training targets.
Defaults to "none", meaning prompts are never used as training targets.
See :class:`Seq2SeqFinetuningCollator` docstring for details.
target_responses (str): Which responses are used as training targets.
Defaults to "last", meaning only the final response in multi-turn examples
will serve as training targets. See :class:`Seq2SeqFinetuningCollator` docstring for
details.
decoder_only_format (bool): Whether to format the
examples for a decoder-only model. See :class:`Seq2SeqFinetuningCollator`
docstring for details.
hf_kwargs (DictConfig, optional): Additional kwargs to
pass to `datasets.load_dataset`, which can be used to load
a dataset from local files.
Returns:
Dataset: The tokenized dataset.
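For reference, a hedged call sketch assembled from the argument list above; the `dataset_constructor` instance name and the keyword-only layout are assumptions about how this module exposes build_from_hf.

    tokenized_dataset = dataset_constructor.build_from_hf(
        dataset_name='my-org/my-sft-data',  # hypothetical HF dataset or {split}.jsonl location
        split='train',
        safe_load=False,
        max_seq_len=2048,
        preprocessing_fn=None,   # fall back to any preprocessor registered for the dataset
        tokenizer=tokenizer,     # a PreTrainedTokenizerBase, assumed in scope
        target_prompts='none',
        target_responses='last',
        decoder_only_format=True,
        hf_kwargs={},
    )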
2 changes: 1 addition & 1 deletion llmfoundry/data/packing.py
@@ -337,7 +337,7 @@ def auto_packing_ratio(
dataloader_cfg (DictConfig): The dataloader configuration for profiling.
tokenizer (PreTrainedTokenizerBase): The tokenizer for profiling.
device_batch_size (int): The size of the batches (number of examples) per device.
num_packing_ratio (int): The number of packing ratios to try.
num_packing_ratios (int): The number of packing ratios to try.
Returns:
A packing ratio that minimizes padding while maintaining zero waste.
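For reference, a hedged sketch of profiling for a packing ratio using the corrected argument name; the config and tokenizer objects are assumed to already exist.

    packing_ratio = auto_packing_ratio(
        dataloader_cfg=dataloader_cfg,   # the finetuning dataloader config to profile
        tokenizer=tokenizer,
        device_batch_size=8,
        num_packing_ratios=20,           # number of candidate ratios to try
    )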
20 changes: 12 additions & 8 deletions llmfoundry/eval/datasets/in_context_learning_evaluation.py
@@ -249,10 +249,13 @@ def read_dataset(
Returns:
dataset: A loaded HF dataset
"""
from datasets import \
Dataset as HFDataset # pyright: ignore[reportGeneralTypeIssues]
from datasets import \
load_dataset # pyright: ignore[reportGeneralTypeIssues]
from datasets import (
Dataset as HFDataset,) # pyright: ignore[reportGeneralTypeIssues]
from datasets import (
load_dataset,) # pyright: ignore[reportGeneralTypeIssues]
if 'hf://' in dataset_uri:
dataset_uri = dataset_uri.replace('hf://', '')
if hf_loading_vars is None:
@@ -1129,6 +1132,7 @@ def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
since the batch may consist of multiple questions, the choice_groupings indicates
which contiguous sequences of elements in the batch correspond to which question
gold_indices indicates which of the [0, N-1] choices is the correct one for each question.
Args:
data (List): List of tokenized datapoints (dicts returned by self._tokenize_example)
@@ -1168,6 +1172,7 @@ def split_batch(self, batch: Any,
and real example, which refers to one possible continuation. As example count and
microbatch_size are tracked in logical example, we split logical attributes by
microbatch_size and real attributes by microbatch_size * num_choices.
Args:
batch (Dict): Batch of data
microbatch_size (int | float): Size of microbatches
@@ -1643,8 +1648,7 @@ def get_icl_task_dataloader(
# At this point, hf_model is randomly initialized
composer_model = HuggingFaceModel(hf_model, hf_tokenizer)
Example:
.. testcode::
@@ -1685,8 +1689,8 @@ def get_icl_task_dataloader(
hf_loading_vars (Dict, default = None): A dictionary containing keyword arguments to be passed into `load_dataset` if dataset is being pulled from HF.
hf_parsing_map (Dict, default = None): A dictionary containing a mapping from HF columns to ICL dataset keys. The dictionary should be formatted {icl_key:[hf_key1, hf_key1]}.
Column contents will be concatenated with ' ' separating them. If not included, will load the columns already present in the HF dataset.
kwargs (Dict[str, Any], default=None): Dictionary containing a mapping
from ICL dataset constructor's parameter names and their desired values.
destination_path: Where the dataloader will be saved.
kwargs (Dict[str, Any], default=None): Dictionary containing a mapping from ICL dataset constructor's parameter names and their desired values.
Returns:
DataLoader: A dataloader used for performing in-context learning evaluation on the dataset provided.
2 changes: 1 addition & 1 deletion llmfoundry/eval/datasets/utils.py
@@ -130,7 +130,7 @@ def make_padded_input(
Args:
context_enc (List): The encoded input to the model
continuation_enc (List): The encoded desired output for the example
max_seq_list (int): Maximum length sequences can be
max_seq_len (int): Maximum length sequences can be
pad_tok_id (int): The token id we pad with
padding_side (str): Which side to pad the context on. Can be 'right' or 'left
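For reference, a hedged sketch of the documented arguments with the corrected max_seq_len name; the concrete token ids are hypothetical and the return value (a padded input tensor) is an assumption.

    padded_input = make_padded_input(
        context_enc=[101, 2023, 2003],   # hypothetical encoded context token ids
        continuation_enc=[1037, 3231],   # hypothetical encoded continuation token ids
        max_seq_len=2048,
        pad_tok_id=0,
        padding_side='left',             # pad on the left of the context
    )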
2 changes: 1 addition & 1 deletion llmfoundry/eval/metrics/nlp.py
@@ -80,7 +80,7 @@ def update(
Args:
batch (dict): Batch must consist minimally of `input_ids` as well as any other structure needed
to compute the metric.
output_logits (torch.Tensor): The model outputs evaluated on the batch `input_ids`
outputs (torch.Tensor): The model outputs evaluated on the batch `input_ids`.
labels (torch.Tensor): The correct outputs.
Raises:
1 change: 1 addition & 0 deletions llmfoundry/models/hf/hf_causal_lm.py
@@ -205,6 +205,7 @@ def build_inner_model(
use_auth_token (bool): Whether to use an authentication token.
config_overrides (Dict[str, Any]): The configuration overrides.
load_in_8bit (bool): Whether to load in 8-bit.
pretrained (bool): Whether the model is pretrained.
prepare_for_fsdp (bool, optional): Whether to prepare the model for FSDP wrapping. Default: False.
Returns:
1 change: 1 addition & 0 deletions llmfoundry/models/layers/attention.py
@@ -606,6 +606,7 @@ def get_qkv(
Args:
x (torch.Tensor): The input tensor.
prev_layer_key_value (Optional[Tuple[torch.Tensor, torch.Tensor]]): The key value of the previous layer.
Returns:
query (torch.Tensor): The query tensor.
1 change: 1 addition & 0 deletions llmfoundry/models/layers/ffn.py
@@ -429,6 +429,7 @@ def set_ffn_device_mesh(
ffn (nn.Module): The FFN module.
moe_world_size (int): The MoE world size.
device_mesh (DeviceMesh): The full device mesh.
get_fsdp_submesh (Callable[[DeviceMesh], DeviceMesh]): A function to get the fsdp submesh.
Raises:
RuntimeError: If the device mesh is 3D.
1 change: 1 addition & 0 deletions llmfoundry/models/mpt/configuration_mpt.py
@@ -147,6 +147,7 @@ def __init__(
reuse_kv_layer:
attn_config:
reuse_kv_layer_idx: -6 # Relative index of the layer whose kv cache to reuse
kwargs (Any): Other relevant keyword arguments.
"""
self.d_model = d_model
self.n_heads = n_heads
1 change: 1 addition & 0 deletions llmfoundry/tokenizers/tiktoken.py
@@ -90,6 +90,7 @@ def __init__(
errors (str, optional): Paradigm to follow when decoding bytes to UTF-8. See
[bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
Defaults to `"replace"`.
kwargs (Any): Other relevant keyword arguments.
"""
try:
import tiktoken