Merge branch 'main' into more-registry-docs

mosaicml · Aug 1, 2024 · b244deb · b244deb
2 parents dfa24c1 + cae89a2
commit b244deb
Show file tree

Hide file tree

Showing 22 changed files with 261 additions and 128 deletions.
diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml
@@ -15,23 +15,28 @@ concurrency:
   cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
 jobs:
   pytest-cpu:
-    uses: mosaicml/ci-testing/.github/workflows/[email protected]
+    name: ${{ matrix.name }}
+    runs-on: ubuntu-latest
     strategy:
       matrix:
         include:
         - name: "cpu-2.3.1"
+          pip_deps: "[all-cpu]"
           container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
           markers: "not gpu"
           pytest_command: "coverage run -m pytest"
-    name: ${{ matrix.name }}
-    if: github.repository_owner == 'mosaicml'
-    with:
-      container: ${{ matrix.container }}
-      name: ${{ matrix.name }}
-      pip_deps: "[all-cpu]"
-      pytest-command: ${{ matrix.pytest_command }}
-      pytest-markers: ${{ matrix.markers }}
-      safe_directory: llm-foundry
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v2
+    - name: Run PR CPU Tests
+      uses: mosaicml/ci-testing/.github/actions/pytest-cpu@v0.1.0
+      with:
+        name: ${{ matrix.name }}
+        container: ${{ matrix.container }}
+        pip_deps: ${{ matrix.pip_deps }}
+        pytest_command: ${{ matrix.pytest_command }}
+        pytest_markers: ${{ matrix.markers }}
+        safe_directory: llm-foundry
   coverage:
     uses: ./.github/workflows/coverage.yaml
     name: Coverage Results

diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml
@@ -4,87 +4,97 @@ on:
     branches:
     - main
     - release/*
-  pull_request_target:
+  pull_request:
     branches:
     - main
     - release/**
   workflow_dispatch:
+# Cancel in-progress runs when a new commit is pushed to the same PR or branch, except on main
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
   cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
 jobs:
   pytest-gpu-1:
-    uses: mosaicml/ci-testing/.github/workflows/[email protected]
+    name: ${{ matrix.name }}
+    runs-on: linux-ubuntu-latest
     strategy:
       fail-fast: false
       matrix:
         include:
         - name: "gpu-2.3.1-1"
           container: mosaicml/llm-foundry:2.3.1_cu121-latest
           markers: "gpu"
-          pytest_command: "coverage run -m pytest"
           pip_deps: "[all]"
-    name: ${{ matrix.name }}
-    if: github.repository_owner == 'mosaicml'
-    with:
-      container: ${{ matrix.container }}
-      git_repo: mosaicml/llm-foundry
-      mcloud-timeout: 1800
-      name: ${{ matrix.name }}
-      pip_deps: ${{ matrix.pip_deps }}
-      pytest-command: ${{ matrix.pytest_command }}
-      pytest-markers: ${{ matrix.markers }}
-      python-version: 3.9
-      gpu_num: 1
-    secrets:
-      mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }}
+          pytest_command: "coverage run -m pytest"
+          ci_repo_gpu_test_ref: v0.1.0
+    steps:
+    - name: Run PR GPU Tests
+      uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.0
+      with:
+        container: ${{ matrix.container }}
+        git_repo: mosaicml/llm-foundry
+        mcloud_timeout: 1800
+        name: ${{ matrix.name }}
+        pip_deps: ${{ matrix.pip_deps }}
+        pytest_command: ${{ matrix.pytest_command }}
+        pytest_markers: ${{ matrix.markers }}
+        python_version: 3.9
+        gpu_num: 1
+        mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }}
+        ci_repo_gpu_test_ref: ${{ matrix.ci_repo_gpu_test_ref }}
   pytest-gpu-2:
-    uses: mosaicml/ci-testing/.github/workflows/[email protected]
+    name: ${{ matrix.name }}
+    runs-on: linux-ubuntu-latest
     strategy:
       fail-fast: false
       matrix:
         include:
         - name: "gpu-2.3.1-2"
           container: mosaicml/llm-foundry:2.3.1_cu121-latest
           markers: "gpu"
-          pytest_command: "coverage run -m pytest"
           pip_deps: "[all]"
-    name: ${{ matrix.name }}
-    if: github.repository_owner == 'mosaicml'
-    with:
-      container: ${{ matrix.container }}
-      git_repo: mosaicml/llm-foundry
-      mcloud-timeout: 1800
-      name: ${{ matrix.name }}
-      pip_deps: ${{ matrix.pip_deps }}
-      pytest-command: ${{ matrix.pytest_command }}
-      pytest-markers: ${{ matrix.markers }}
-      python-version: 3.9
-      gpu_num: 2
-    secrets:
-      mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }}
+          pytest_command: "coverage run -m pytest"
+          ci_repo_gpu_test_ref: v0.1.0
+    steps:
+    - name: Run PR GPU Tests
+      uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.0
+      with:
+        container: ${{ matrix.container }}
+        git_repo: mosaicml/llm-foundry
+        mcloud_timeout: 1800
+        name: ${{ matrix.name }}
+        pip_deps: ${{ matrix.pip_deps }}
+        pytest_command: ${{ matrix.pytest_command }}
+        pytest_markers: ${{ matrix.markers }}
+        python_version: 3.9
+        gpu_num: 2
+        mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }}
+        ci_repo_gpu_test_ref: ${{ matrix.ci_repo_gpu_test_ref }}
   pytest-gpu-4:
-    uses: mosaicml/ci-testing/.github/workflows/[email protected]
+    name: ${{ matrix.name }}
+    runs-on: linux-ubuntu-latest
     strategy:
       fail-fast: false
       matrix:
         include:
         - name: "gpu-2.3.1-4"
           container: mosaicml/llm-foundry:2.3.1_cu121-latest
           markers: "gpu"
-          pytest_command: "coverage run -m pytest"
           pip_deps: "[all]"
-    name: ${{ matrix.name }}
-    if: github.repository_owner == 'mosaicml'
-    with:
-      container: ${{ matrix.container }}
-      git_repo: mosaicml/llm-foundry
-      mcloud-timeout: 1800
-      name: ${{ matrix.name }}
-      pip_deps: ${{ matrix.pip_deps }}
-      pytest-command: ${{ matrix.pytest_command }}
-      pytest-markers: ${{ matrix.markers }}
-      python-version: 3.9
-      gpu_num: 4
-    secrets:
-      mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }}
+          pytest_command: "coverage run -m pytest"
+          ci_repo_gpu_test_ref: v0.1.0
+    steps:
+    - name: Run PR GPU Tests
+      uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.0
+      with:
+        container: ${{ matrix.container }}
+        git_repo: mosaicml/llm-foundry
+        mcloud_timeout: 1800
+        name: ${{ matrix.name }}
+        pip_deps: ${{ matrix.pip_deps }}
+        pytest_command: ${{ matrix.pytest_command }}
+        pytest_markers: ${{ matrix.markers }}
+        python_version: 3.9
+        gpu_num: 4
+        mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }}
+        ci_repo_gpu_test_ref: ${{ matrix.ci_repo_gpu_test_ref }}
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -77,17 +77,6 @@ repos:
   hooks:
   - id: docformatter
     args: [--in-place, --wrap-summaries=80, --wrap-descriptions=80]
-- repo: https://github.com/PyCQA/pydocstyle
-  hooks:
-  - id: pydocstyle
-    name: pydocstyle
-    entry: pydocstyle
-    language: python
-    types: [python]
-    exclude: (.ci|.github)
-    additional_dependencies:
-    - toml
-  rev: 6.1.1
 - repo: https://github.com/adrienverge/yamllint.git
   rev: v1.28.0
   hooks:

diff --git a/llmfoundry/command_utils/data_prep/convert_dataset_json.py b/llmfoundry/command_utils/data_prep/convert_dataset_json.py
@@ -34,7 +34,7 @@ def build_hf_dataset(
     """Build an IterableDataset over the HF C4 or pile source data.
 
     Args:
-        dataset_name (str): Dataset name
+        path (str): Dataset name
         split (str): Split name.
         mode (ConcatMode): NO_CONCAT, or CONCAT_TOKENS
         max_length (int): The length of concatenated tokens

diff --git a/llmfoundry/command_utils/data_prep/convert_delta_to_json.py b/llmfoundry/command_utils/data_prep/convert_delta_to_json.py
@@ -78,15 +78,16 @@ def to_cf(self: 'SparkConnectClient',
     return the schema and drops all other responses.
 
     Args:
-       plan (pb2.Plan): The plan object to be executed by spark.
-       type (str): The output format of the result, supported formats are 'json', 'csv', and 'arrow'.
+        self (SparkConnectClient): The SparkConnectClient we are processing.
+        plan (pb2.Plan): The plan object to be executed by spark.
+        type (str): The output format of the result, supported formats are 'json', 'csv', and 'arrow'.
 
     Returns:
-       Tuple[List[Result], int, bool]: A tuple containing:
-           - A list of Result namedtuples, each containing a URL, row count, compressed size,
-             and uncompressed size of the part of the result.
-           - Total row count of all parts of the result.
-           - A boolean indicating whether the result has been truncated.
+        Tuple[List[Result], int, bool]: A tuple containing:
+            - A list of Result namedtuples, each containing a URL, row count, compressed size,
+                and uncompressed size of the part of the result.
+            - Total row count of all parts of the result.
+            - A boolean indicating whether the result has been truncated.
     """
     req = self._execute_plan_request_with_metadata()
     req.plan.CopyFrom(plan)
@@ -120,8 +121,9 @@ def to_cf(self: 'SparkConnectClient',
     )
 
     # Create the iterator
-    from pyspark.sql.connect.client.reattach import \
-        ExecutePlanResponseReattachableIterator
+    from pyspark.sql.connect.client.reattach import (
+        ExecutePlanResponseReattachableIterator,
+    )
     iterator = ExecutePlanResponseReattachableIterator(
         req,
         self._stub,
@@ -169,6 +171,7 @@ def collect_as_cf(self: 'DataFrame',
     uses the `to_cf` method to execute the plan and fetch results as presigned URLs.
 
     Args:
+        self (DataFrame): The Spark Connect DataFrame we are processing.
         type (str): The output format of the result, supported formats are 'json', 'csv', and 'arrow'.
 
     Returns:
@@ -693,8 +696,9 @@ def _check_imports():
         import pyspark.sql.connect.proto.cloud_pb2 as cloud_pb2
         from pyspark.sql import SparkSession
         from pyspark.sql.connect.client.core import SparkConnectClient
-        from pyspark.sql.connect.client.reattach import \
-            ExecutePlanResponseReattachableIterator
+        from pyspark.sql.connect.client.reattach import (
+            ExecutePlanResponseReattachableIterator,
+        )
         from pyspark.sql.connect.dataframe import DataFrame
         from pyspark.sql.dataframe import DataFrame as SparkDataFrame
         from pyspark.sql.types import Row

diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py
@@ -64,9 +64,12 @@ def build_finetuning_dataloader(
     on which you intend to use, as explained below.
 
     Args:
-        name (str): The type of dataloader to build. Must = "finetuning".
-        ---
-        *** HuggingFace dataset config fields ***
+        tokenizer (transformers.PreTrainedTokenizer): The tokenizer used to
+            prepare the data from raw text. Any missing sentinel tokens will
+            be added by the collator.
+        device_batch_size (int, float): The size of the batches (number of examples)
+            that the dataloader will produce.
+        dataset (Dict[str, Any]): A HuggingFace dataset config which contains the following fields:
             dataset.hf_name (str, optional): The name of the HuggingFace dataset
                 to use. Can also be a remote http(s) directory or object store bucket
                 containing the file {split}.jsonl in the format (prompt, response),
@@ -130,16 +133,32 @@ def build_finetuning_dataloader(
                     The script `scripts/misc/profile_packing.py` can help
                     you choose the best packing_ratio.
             dataset.shuffle (bool): Whether to shuffle the dataset.
-            ___
             See :class:`StreamingFinetuningDataset` for info on other standard config
                 options within `dataset` that will be passed as kwargs if
                 using the streaming codepath.
-            ---
-        tokenizer (transformers.PreTrainedTokenizer): The tokenizer used to
-            prepare the data from raw text. Any missing sentinel tokens will
-            be added by the collator.
-        device_batch_size (int, float): The size of the batches (number of examples)
-            that the dataloader will produce.
+        num_workers (int, optional): How many subprocesses to use for data loading.
+            0 means that the data will be loaded in the main process. The default is 0.
+            This argument is passed directly to the pytorch :class:`DataLoader`.
+        drop_last (bool, optional): If True, drop the last incomplete batch when the dataset
+            size is not divisible by the batch size. If False and the size of the dataset is
+            not divisible by the batch size, then the last batch will be smaller. The
+            default is False. This argument is passed directly to the pytorch :class:`DataLoader`.
+        pin_memory (bool, optional): If True, the data loader will copy Tensors into device/CUDA
+            pinned memory before returning them. If your data elements are a custom type, or your
+            `collate_fn` returns a batch that is a custom type, see the PyTorch memory-pinning
+            documentation. This argument is passed directly to the pytorch :class:`DataLoader`.
+        prefetch_factor (int, optional): Number of batches loaded in advance by each worker.
+            2 means there will be a total of 2 * num_workers batches prefetched across all workers.
+            The default depends on num_workers: None if num_workers=0, otherwise 2. This argument
+            is passed directly to the pytorch :class:`DataLoader`.
+        persistent_workers (bool, optional): If True, the data loader will not shut down the worker
+            processes after a dataset has been consumed once. This keeps the workers' Dataset
+            instances alive. The default is False. This argument is passed directly to the
+            pytorch :class:`DataLoader`.
+        timeout (int, optional): If positive, the timeout value for collecting a batch from workers.
+            Should always be non-negative. The default is 0. This argument is passed directly to the
+            pytorch :class:`DataLoader`.
         See :class:`DataLoader` for standard argument options to the pytorch
             dataloader, such as `drop_last`, `num_workers`, etc.
 
@@ -357,7 +376,50 @@ def _validate_config(
     the other.
 
     Args:
-        dataset_cfg (DictConfig): The dataset configuration to be validated.
+        max_seq_len (int): The maximum length of sequences
+            in the batch. See :class:`Seq2SeqFinetuningCollator` docstring
+            for details.
+        decoder_only_format (bool): Whether to format the
+            examples for a decoder-only model. See :class:`Seq2SeqFinetuningCollator`
+            docstring for details.
+        hf_name (str, optional): The name of the HuggingFace dataset
+            to use. Can also be a remote http(s) directory or object store bucket
+            containing the file {split}.jsonl in the format (prompt, response),
+            in which case the builder will create a HuggingFace dataset.
+        local (str, optional): Local path where remote data
+            will be streamed to. Only valid if `remote` has
+            also been set.
+        remote (str, optional): Location of a MDS-formatted
+            streaming dataset to use. Setting this will tell the builder
+            to create a streaming dataset rather than a HuggingFace dataset.
+        hf_kwargs (DictConfig, optional): Additional kwargs to
+            pass to `datasets.load_dataset`, which can be used to load
+            a dataset from local files.
+        preprocessing_fn (str, optional): The name/import path of
+            the preprocessing function to use for formatting the data examples.
+            If ``None`` (default), the builder will use the preprocessing function
+                registered under `hf_name` (see `tasks.py`), if one exists,
+                otherwise it will skip preprocessing.
+            If `preprocessing_fn` corresponds to a registered preprocessing
+                function in `tasks.py`, the builder will use that.
+            Otherwise, it will interpret `preprocessing_fn` as a
+                "import.path:function_name" import path; e.g., it will call
+                `from import.path import function_name` and use the imported
+                function as the preprocessing function.
+        safe_load (bool, optional): Whether to enforce safe loading of the dataset.
+            If `None`, will default to not applying any safe loading.
+        streams (Dict[str, Any], optional): A dictionary with multiple data streams.
+            If `None`, will assume no streams.
+        target_prompts (str): Which prompts are used as training targets.
+            Defaults to "none", meaning prompts are never used as training targets.
+            See :class:`Seq2SeqFinetuningCollator` docstring for details.
+        target_responses (str): Which responses are used as training targets.
+            Defaults to "last", meaning only the final response in multi-turn examples
+            will serve as training targets. See :class:`Seq2SeqFinetuningCollator` docstring for
+            details.
+        kwargs (DictConfig, optional): Additional kwargs to
+                pass to `datasets.load_dataset`, which can be used to load
+                a dataset from local files.
 
     Raises:
         ValueError: If the dataset configuration does not meet the requirements.
@@ -504,7 +566,7 @@ def _download_remote_hf_dataset(remote_path: str, split: str) -> str:
     completed, the function removes the signal file.
 
     Args:
-        hf_name (str): The path of the HuggingFace dataset to download.
+        remote_path (str): The path of the HuggingFace dataset to download.
         split (str): The dataset split to download (e.g., 'train', 'validation', 'test').
 
     Returns: