fix missing argument descriptions
Eitan Turok committed Aug 1, 2024
1 parent caae74d commit 95869eb
Showing 19 changed files with 156 additions and 45 deletions.
2 changes: 1 addition & 1 deletion llmfoundry/command_utils/data_prep/convert_dataset_json.py
@@ -34,7 +34,7 @@ def build_hf_dataset(
"""Build an IterableDataset over the HF C4 or pile source data.
Args:
dataset_name (str): Dataset name
path (str): Dataset name
split (str): Split name.
mode (ConcatMode): NO_CONCAT, or CONCAT_TOKENS
max_length (int): The length of concatenated tokens
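For reference, a hedged usage sketch of the build_hf_dataset call documented above; the import locations and the exact concatenation handling are assumptions, not confirmed by this diff.

    # Import paths are assumptions based on the file path shown in this commit.
    from llmfoundry.command_utils.data_prep.convert_dataset_json import (
        ConcatMode,
        build_hf_dataset,
    )

    dataset = build_hf_dataset(
        path='data/train.jsonl',      # hypothetical {split}.jsonl source
        split='train',
        mode=ConcatMode.NO_CONCAT,    # or ConcatMode.CONCAT_TOKENS, which concatenates tokens
        max_length=2048,              # only meaningful when concatenating tokens
    )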
26 changes: 15 additions & 11 deletions llmfoundry/command_utils/data_prep/convert_delta_to_json.py
@@ -78,15 +78,16 @@ def to_cf(self: 'SparkConnectClient',
return the schema and drops all other responses.
Args:
plan (pb2.Plan): The plan object to be executed by spark.
type (str): The output format of the result, supported formats are 'json', 'csv', and 'arrow'.
self (SparkConnectClient): The SparkConnectClient we are processing.
plan (pb2.Plan): The plan object to be executed by spark.
type (str): The output format of the result, supported formats are 'json', 'csv', and 'arrow'.
Returns:
Tuple[List[Result], int, bool]: A tuple containing:
- A list of Result namedtuples, each containing a URL, row count, compressed size,
and uncompressed size of the part of the result.
- Total row count of all parts of the result.
- A boolean indicating whether the result has been truncated.
"""
req = self._execute_plan_request_with_metadata()
req.plan.CopyFrom(plan)
@@ -120,8 +121,9 @@ def to_cf(self: 'SparkConnectClient',
)

# Create the iterator
from pyspark.sql.connect.client.reattach import \
ExecutePlanResponseReattachableIterator
from pyspark.sql.connect.client.reattach import (
ExecutePlanResponseReattachableIterator,
)
iterator = ExecutePlanResponseReattachableIterator(
req,
self._stub,
@@ -169,6 +171,7 @@ def collect_as_cf(self: 'DataFrame',
uses the `to_cf` method to execute the plan and fetch results as presigned URLs.
Args:
self (pd.DataFrame): The dataframe we are processing.
type (str): The output format of the result, supported formats are 'json', 'csv', and 'arrow'.
Returns:
@@ -693,8 +696,9 @@ def _check_imports():
import pyspark.sql.connect.proto.cloud_pb2 as cloud_pb2
from pyspark.sql import SparkSession
from pyspark.sql.connect.client.core import SparkConnectClient
from pyspark.sql.connect.client.reattach import \
ExecutePlanResponseReattachableIterator
from pyspark.sql.connect.client.reattach import (
ExecutePlanResponseReattachableIterator,
)
from pyspark.sql.connect.dataframe import DataFrame
from pyspark.sql.dataframe import DataFrame as SparkDataFrame
from pyspark.sql.types import Row
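For reference, a hedged sketch of the cloud-fetch contract described in the docstrings above. The patched collect_as_cf method and the tuple layout come from those docstrings; the attribute names on the Result namedtuple (url, row_count, compressed_size, uncompressed_size) are assumptions.

    # `spark_df` is assumed to be a Spark Connect DataFrame with the patched method.
    results, total_rows, is_truncated = spark_df.collect_as_cf(type='json')
    for part in results:
        # Each Result is documented to carry a URL plus row count and sizes for one part.
        print(part.url, part.row_count, part.compressed_size, part.uncompressed_size)
    if is_truncated:
        print(f'Result was truncated after {total_rows} rows')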
86 changes: 74 additions & 12 deletions llmfoundry/data/finetuning/dataloader.py
@@ -64,9 +64,12 @@ def build_finetuning_dataloader(
on which you intend to use, as explained below.
Args:
name (str): The type of dataloader to build. Must = "finetuning".
---
*** HuggingFace dataset config fields ***
tokenizer (transformers.PreTrainedTokenizer): The tokenizer used to
prepare the data from raw text. Any missing sentinel tokens will
be added by the collator.
device_batch_size (int, float): The size of the batches (number of examples)
that the dataloader will produce.
dataset (Dict[str, Any]): A HuggingFace dataset config which contains the following fields:
dataset.hf_name (str, optional): The name of the HuggingFace dataset
to use. Can also be a remote http(s) directory or object store bucket
containing the file {split}.jsonl in the format (prompt, response),
Expand Down Expand Up @@ -130,16 +133,32 @@ def build_finetuning_dataloader(
The script `scripts/misc/profile_packing.py` can help
you choose the best packing_ratio.
dataset.shuffle (bool): Whether to shuffle the dataset.
___
See :class:`StreamingFinetuningDataset` for info on other standard config
options within `dataset` that will be passed as kwargs if
using the streaming codepath.
---
tokenizer (transformers.PreTrainedTokenizer): The tokenizer used to
prepare the data from raw text. Any missing sentinel tokens will
be added by the collator.
device_batch_size (int, float): The size of the batches (number of examples)
that the dataloader will produce.
num_workers (int, optional): How many subprocesses to use for data loading.
0 means that the data will be loaded in the main process. The default is 0.
This argument is passed directly to the pytorch :class:`DataLoader`.
drop_last (bool, optional): If true, drop the last incomplete batch, if the dataset
size is not divisible by the batch size. If False and the size of dataset is
not divisible by the batch size, then the last batch will be smaller. The
default is False. This argument is passed directly to the pytorch :class:`DataLoader`.
pin_memory (bool, optional): If True, the data loader will copy Tensors into device/CUDA
pinned memory before returning them. If your data elements are a custom type, or your
`collate_fn` returns a batch that is a custom type. This argument is passed directly to
the pytorch :class:`DataLoader`.
prefetch_factor (int, optional): Number of batches loaded in advance by each worker.
2 means there will be a total of 2 * num_workers batches prefetched across all workers.
(default value depends on the set value for num_workers. If value of num_workers=0 default
is None. Otherwise, if value of num_workers > 0 default is 2). This argument is passed
directly to the pytorch :class:`DataLoader`.
persistent_workers (bool, optional): If True, the data loader will not shut down the worker
processes after a dataset has been consumed once. This allows to maintain the workers
Dataset instances alive. The default is False. This argument is passed directly to the
pytorch :class:`DataLoader`.
timeout (int, optional): If positive, the timeout value for collecting a batch from workers.
Should always be non-negative. The default is 0. This argument is passed directly to the
pytorch :class:`DataLoader`.
See :class:`DataLoader` for standard argument options to the pytorch
dataloader, such as `drop_last`, `num_workers`, etc.
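For reference, a minimal call sketch wired up from the arguments documented above; the dataset name and all sizes are placeholders, and the streaming fields (dataset.remote / dataset.local) are omitted.

    dataloader = build_finetuning_dataloader(
        tokenizer=tokenizer,      # a transformers.PreTrainedTokenizer, assumed in scope
        device_batch_size=8,
        dataset={
            'hf_name': 'my-org/my-sft-data',  # hypothetical HF dataset or {split}.jsonl location
            'split': 'train',                 # split field assumed; not shown in this hunk
            'max_seq_len': 2048,
            'shuffle': True,
        },
        num_workers=8,
        drop_last=True,
        pin_memory=True,
        prefetch_factor=2,
        persistent_workers=True,
        timeout=0,
    )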
@@ -357,7 +376,50 @@ def _validate_config(
the other.
Args:
dataset_cfg (DictConfig): The dataset configuration to be validated.
max_seq_len (int): The maximum length of sequences
in the batch. See :class:`Seq2SeqFinetuningCollator` docstring
for details.
decoder_only_format (bool): Whether to format the
examples for a decoder-only model. See :class:`Seq2SeqFinetuningCollator`
docstring for details.
hf_name (str, optional): The name of the HuggingFace dataset
to use. Can also be a remote http(s) directory or object store bucket
containing the file {split}.jsonl in the format (prompt, response),
in which case the builder will create a HuggingFace dataset.
local (str, optional): Local path where remote data
will be streamed to. Only valid if `cfg.dataset.remote` has
also been set.
remote (str, optional): Location of a MDS-formatted
streaming dataset to use. Setting this will tell the builder
to create a streaming dataset rather than a HuggingFace dataset.
hf_kwargs (DictConfig, optional): Additional kwargs to
pass to `datasets.load_dataset`, which can be used to load
a dataset from local files.
preprocessing_fn (str, optional): The name/import path of
the preprocessing function to use for formatting the data examples.
If ``None`` (default), the builder will use the preprocessing function
registered under `hf_name` (see `tasks.py`), if one exists,
otherwise it will skip preprocessing.
If `preprocessing_fn` corresponds to a registered preprocessing
function in `tasks.py`, the builder will use that.
Otherwise, it will interpret `preprocessing_fn` as a
"import.path:function_name" import path; e.g., it will call
`from import.path import function_name` and use the imported
function as the preprocessing function.
safe_load (bool, optional): Whether to enforce safe loading of the dataset.
If `None`, will default to not applying any safe loading.
streams (Dict[str, Any], optional): A dictionary with multiple data streams.
If `None`, will assume no streams.
target_prompts (str): Which prompts are used as training targets.
Defaults to "none", meaning prompts are never used as training targets.
See :class:`Seq2SeqFinetuningCollator` docstring for details.
target_responses (str): Which responses are used as training targets.
Defaults to "last", meaning only the final response in multi-turn examples
will serve as training targets. See :class:`Seq2SeqFinetuningCollator` docstring for
details.
kwargs (DictConfig, optional): Additional kwargs to
pass to `datasets.load_dataset`, which can be used to load
a dataset from local files.
Raises:
ValueError: If the dataset configuration does not meet the requirements.
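For reference, two hedged sketches of the mutually exclusive dataset configurations this validator distinguishes (HuggingFace-backed vs. streaming); every value is hypothetical.

    # HuggingFace-backed finetuning data
    hf_backed = {
        'hf_name': 'my-org/my-sft-data',
        'max_seq_len': 2048,
        'decoder_only_format': True,
    }

    # Streaming MDS-formatted finetuning data; `local` is only valid alongside `remote`
    streaming_backed = {
        'remote': 's3://my-bucket/mds-finetuning-data',
        'local': '/tmp/mds-cache',
        'max_seq_len': 2048,
        'decoder_only_format': True,
    }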
@@ -504,7 +566,7 @@ def _download_remote_hf_dataset(remote_path: str, split: str) -> str:
completed, the function removes the signal file.
Args:
hf_name (str): The path of the HuggingFace dataset to download.
remote_path (str): The path of the HuggingFace dataset to download.
split (str): The dataset split to download (e.g., 'train', 'validation', 'test').
Returns:
34 changes: 30 additions & 4 deletions llmfoundry/data/finetuning/tasks.py
@@ -68,6 +68,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:
stitch_turns_decoder_only,
stitch_turns_encoder_decoder,
)

# yapf: disable
from llmfoundry.utils.exceptions import (
ALLOWED_MESSAGES_KEYS,
@@ -88,6 +89,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:
UnableToProcessPromptResponseError,
UnknownExampleTypeError,
)

# yapf: enable
from llmfoundry.utils.logging_utils import SpecificWarningFilter

@@ -162,7 +164,7 @@ def _is_empty_or_nonexistent(dirpath: str) -> bool:
Args:
dirpath (str): Directory path to check.
Returns
Returns:
True if directory is empty or non-existent. False otherwise.
"""
return not os.path.isdir(dirpath) or len(os.listdir(dirpath)) == 0
@@ -820,9 +822,33 @@ def build_from_hf(
Note: This function will drop examples where the prompt is longer than the max_seq_len
Args:
cfg (DictConfig): The dataset configuration.
max_seq_len (int): The maximum sequence length. Examples with prompts longer than this will be dropped.
tokenizer (Tokenizer): The tokenizer to be used for tokenizing the dataset.
dataset_name (str): The name of the HuggingFace dataset
to use. Can also be a remote http(s) directory or object store bucket
containing the file {split}.jsonl in the format (prompt, response),
in which case the builder will create a HuggingFace dataset.
split (str): The split of the HuggingFace dataset.
safe_load (bool, optional): Whether to enforce safe loading of the dataset.
If `None`, will default to not applying any safe loading.
max_seq_len (int): The maximum length of sequences
in the batch. See :class:`Seq2SeqFinetuningCollator` docstring
for details.
preprocessing_fn (Callable, optional): The preprocessing function to use for
formatting the data examples.
tokenizer (PreTrainedTokenizerBase): The tokenizer to be used for tokenizing
the HuggingFace dataset.
target_prompts (str): Which prompts are used as training targets.
Defaults to "none", meaning prompts are never used as training targets.
See :class:`Seq2SeqFinetuningCollator` docstring for details.
target_responses (str): Which responses are used as training targets.
Defaults to "last", meaning only the final response in multi-turn examples
will serve as training targets. See :class:`Seq2SeqFinetuningCollator` docstring for
details.
decoder_only_format (bool): Whether to format the
examples for a decoder-only model. See :class:`Seq2SeqFinetuningCollator`
docstring for details.
hf_kwargs (DictConfig, optional): Additional kwargs to
pass to `datasets.load_dataset`, which can be used to load
a dataset from local files.
Returns:
Dataset: The tokenized dataset.
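For reference, a hedged call sketch assembled from the argument list above; the `dataset_constructor` instance name and the keyword-only layout are assumptions about how this module exposes build_from_hf.

    tokenized_dataset = dataset_constructor.build_from_hf(
        dataset_name='my-org/my-sft-data',  # hypothetical HF dataset or {split}.jsonl location
        split='train',
        safe_load=False,
        max_seq_len=2048,
        preprocessing_fn=None,   # fall back to any preprocessor registered for the dataset
        tokenizer=tokenizer,     # a PreTrainedTokenizerBase, assumed in scope
        target_prompts='none',
        target_responses='last',
        decoder_only_format=True,
        hf_kwargs={},
    )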
2 changes: 1 addition & 1 deletion llmfoundry/data/packing.py
@@ -337,7 +337,7 @@ def auto_packing_ratio(
dataloader_cfg (DictConfig): The dataloader configuration for profiling.
tokenizer (PreTrainedTokenizerBase): The tokenizer for profiling.
device_batch_size (int): The size of the batches (number of examples) per device.
num_packing_ratio (int): The number of packing ratios to try.
num_packing_ratios (int): The number of packing ratios to try.
Returns:
A packing ratio that minimizes padding while maintaining zero waste.
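For reference, a hedged sketch of profiling for a packing ratio using the corrected argument name; the config and tokenizer objects are assumed to already exist.

    packing_ratio = auto_packing_ratio(
        dataloader_cfg=dataloader_cfg,   # the finetuning dataloader config to profile
        tokenizer=tokenizer,
        device_batch_size=8,
        num_packing_ratios=20,           # number of candidate ratios to try
    )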
20 changes: 12 additions & 8 deletions llmfoundry/eval/datasets/in_context_learning_evaluation.py
@@ -249,10 +249,13 @@ def read_dataset(
Returns:
dataset: A loaded HF dataset
"""
from datasets import \
Dataset as HFDataset # pyright: ignore[reportGeneralTypeIssues]
from datasets import \
load_dataset # pyright: ignore[reportGeneralTypeIssues]
from datasets import (
Dataset as HFDataset,) # pyright: ignore[reportGeneralTypeIssues]
from datasets import (
load_dataset,) # pyright: ignore[reportGeneralTypeIssues]
if 'hf://' in dataset_uri:
dataset_uri = dataset_uri.replace('hf://', '')
if hf_loading_vars is None:
@@ -1129,6 +1132,7 @@ def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
since the batch may consist of multiple questions, the choice_groupings indicates
which contiguous sequences of elements in the batch correspond to which question
gold_indices indicates which of the [0, N-1] choices is the correct one for each question.
Args:
data (List): List of tokenized datapoints (dicts returned by self._tokenize_example)
@@ -1168,6 +1172,7 @@ def split_batch(self, batch: Any,
and real example, which refers to one possible continuation. As example count and
microbatch_size are tracked in logical example, we split logical attributes by
microbatch_size and real attributes by microbatch_size * num_choices.
Args:
batch (Dict): Batch of data
microbatch_size (int | float): Size of microbatches
@@ -1643,8 +1648,7 @@ def get_icl_task_dataloader(
# At this point, hf_model is randomly initialized
composer_model = HuggingFaceModel(hf_model, hf_tokenizer)
Example:
.. testcode::
@@ -1685,8 +1689,8 @@ def get_icl_task_dataloader(
hf_loading_vars (Dict, default = None): A dictionary containing keyword arguments to be passed into `load_dataset` if dataset is being pulled from HF.
hf_parsing_map (Dict, default = None): A dictionary containing a mapping from HF columns to ICL dataset keys. The dictionary should be formatted {icl_key:[hf_key1, hf_key1]}.
Column contents will be concatenated with ' ' separating them. If not included, will load the columns already present in the HF dataset.
kwargs (Dict[str, Any], default=None): Dictionary containing a mapping
from ICL dataset constructor's parameter names and their desired values.
destination_path: Where the dataloader will be saved.
kwargs (Dict[str, Any], default=None): Dictionary containing a mapping from ICL dataset constructor's parameter names and their desired values.
Returns:
DataLoader: A dataloader used for performing in-context learning evaluation on the dataset provided.
2 changes: 1 addition & 1 deletion llmfoundry/eval/datasets/utils.py
@@ -130,7 +130,7 @@ def make_padded_input(
Args:
context_enc (List): The encoded input to the model
continuation_enc (List): The encoded desired output for the example
max_seq_list (int): Maximum length sequences can be
max_seq_len (int): Maximum length sequences can be
pad_tok_id (int): The token id we pad with
padding_side (str): Which side to pad the context on. Can be 'right' or 'left
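For reference, a hedged sketch of the documented arguments with the corrected max_seq_len name; the concrete token ids are hypothetical and the return value (a padded input tensor) is an assumption.

    padded_input = make_padded_input(
        context_enc=[101, 2023, 2003],   # hypothetical encoded context token ids
        continuation_enc=[1037, 3231],   # hypothetical encoded continuation token ids
        max_seq_len=2048,
        pad_tok_id=0,
        padding_side='left',             # pad on the left of the context
    )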
2 changes: 1 addition & 1 deletion llmfoundry/eval/metrics/nlp.py
@@ -80,7 +80,7 @@ def update(
Args:
batch (dict): Batch must consist minimally of `input_ids` as well as any other structure needed
to compute the metric.
output_logits (torch.Tensor): The model outputs evaluated on the batch `input_ids`
outputs (torch.Tensor): The model outputs evaluated on the batch `input_ids`.
labels (torch.Tensor): The correct outputs.
Raises:
1 change: 1 addition & 0 deletions llmfoundry/models/hf/hf_causal_lm.py
@@ -205,6 +205,7 @@ def build_inner_model(
use_auth_token (bool): Whether to use an authentication token.
config_overrides (Dict[str, Any]): The configuration overrides.
load_in_8bit (bool): Whether to load in 8-bit.
pretrained (bool): Whether the model is pretrained.
prepare_for_fsdp (bool, optional): Whether to prepare the model for FSDP wrapping. Default: False.
Returns:
1 change: 1 addition & 0 deletions llmfoundry/models/layers/attention.py
@@ -606,6 +606,7 @@ def get_qkv(
Args:
x (torch.Tensor): The input tensor.
prev_layer_key_value (Optional[Tuple[torch.Tensor, torch.Tensor]]): The key value of the previous layer.
Returns:
query (torch.Tensor): The query tensor.
1 change: 1 addition & 0 deletions llmfoundry/models/layers/ffn.py
@@ -429,6 +429,7 @@ def set_ffn_device_mesh(
ffn (nn.Module): The FFN module.
moe_world_size (int): The MoE world size.
device_mesh (DeviceMesh): The full device mesh.
get_fsdp_submesh (Callable[[DeviceMesh], DeviceMesh]): A function to get the fsdp submesh.
Raises:
RuntimeError: If the device mesh is 3D.
1 change: 1 addition & 0 deletions llmfoundry/models/mpt/configuration_mpt.py
@@ -147,6 +147,7 @@ def __init__(
reuse_kv_layer:
attn_config:
reuse_kv_layer_idx: -6 # Relative index of the layer whose kv cache to reuse
kwargs (Any): Other relevant keyword arguments.
"""
self.d_model = d_model
self.n_heads = n_heads
1 change: 1 addition & 0 deletions llmfoundry/tokenizers/tiktoken.py
@@ -90,6 +90,7 @@ def __init__(
errors (str, optional): Paradigm to follow when decoding bytes to UTF-8. See
[bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
Defaults to `"replace"`.
kwargs (Any): Other relevant keyword arguments.
"""
try:
import tiktoken