Skip to content

Commit

Permalink
Merge branch 'main' into more-registry-docs
Browse files Browse the repository at this point in the history
  • Loading branch information
dakinggg authored Aug 1, 2024
2 parents dfa24c1 + cae89a2 commit b244deb
Show file tree
Hide file tree
Showing 22 changed files with 261 additions and 128 deletions.
25 changes: 15 additions & 10 deletions .github/workflows/pr-cpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,28 @@ concurrency:
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
jobs:
pytest-cpu:
uses: mosaicml/ci-testing/.github/workflows/[email protected]
name: ${{ matrix.name }}
runs-on: ubuntu-latest
strategy:
matrix:
include:
- name: "cpu-2.3.1"
pip_deps: "[all-cpu]"
container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
markers: "not gpu"
pytest_command: "coverage run -m pytest"
name: ${{ matrix.name }}
if: github.repository_owner == 'mosaicml'
with:
container: ${{ matrix.container }}
name: ${{ matrix.name }}
pip_deps: "[all-cpu]"
pytest-command: ${{ matrix.pytest_command }}
pytest-markers: ${{ matrix.markers }}
safe_directory: llm-foundry
steps:
- name: Checkout code
uses: actions/checkout@v2
- name: Run PR CPU Tests
uses: mosaicml/ci-testing/.github/actions/[email protected]
with:
name: ${{ matrix.name }}
container: ${{ matrix.container }}
pip_deps: ${{ matrix.pip_deps }}
pytest_command: ${{ matrix.pytest_command }}
pytest_markers: ${{ matrix.markers }}
safe_directory: llm-foundry
coverage:
uses: ./.github/workflows/coverage.yaml
name: Coverage Results
Expand Down
108 changes: 59 additions & 49 deletions .github/workflows/pr-gpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,87 +4,97 @@ on:
branches:
- main
- release/*
pull_request_target:
pull_request:
branches:
- main
- release/**
workflow_dispatch:
# Cancel old runs when a new commit is pushed to the same branch, unless the branch is main
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
jobs:
pytest-gpu-1:
uses: mosaicml/ci-testing/.github/workflows/[email protected]
name: ${{ matrix.name }}
runs-on: linux-ubuntu-latest
strategy:
fail-fast: false
matrix:
include:
- name: "gpu-2.3.1-1"
container: mosaicml/llm-foundry:2.3.1_cu121-latest
markers: "gpu"
pytest_command: "coverage run -m pytest"
pip_deps: "[all]"
name: ${{ matrix.name }}
if: github.repository_owner == 'mosaicml'
with:
container: ${{ matrix.container }}
git_repo: mosaicml/llm-foundry
mcloud-timeout: 1800
name: ${{ matrix.name }}
pip_deps: ${{ matrix.pip_deps }}
pytest-command: ${{ matrix.pytest_command }}
pytest-markers: ${{ matrix.markers }}
python-version: 3.9
gpu_num: 1
secrets:
mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }}
pytest_command: "coverage run -m pytest"
ci_repo_gpu_test_ref: v0.1.0
steps:
- name: Run PR GPU Tests
uses: mosaicml/ci-testing/.github/actions/[email protected]
with:
container: ${{ matrix.container }}
git_repo: mosaicml/llm-foundry
mcloud_timeout: 1800
name: ${{ matrix.name }}
pip_deps: ${{ matrix.pip_deps }}
pytest_command: ${{ matrix.pytest_command }}
pytest_markers: ${{ matrix.markers }}
python_version: 3.9
gpu_num: 1
mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }}
ci_repo_gpu_test_ref: ${{ matrix.ci_repo_gpu_test_ref }}
pytest-gpu-2:
uses: mosaicml/ci-testing/.github/workflows/[email protected]
name: ${{ matrix.name }}
runs-on: linux-ubuntu-latest
strategy:
fail-fast: false
matrix:
include:
- name: "gpu-2.3.1-2"
container: mosaicml/llm-foundry:2.3.1_cu121-latest
markers: "gpu"
pytest_command: "coverage run -m pytest"
pip_deps: "[all]"
name: ${{ matrix.name }}
if: github.repository_owner == 'mosaicml'
with:
container: ${{ matrix.container }}
git_repo: mosaicml/llm-foundry
mcloud-timeout: 1800
name: ${{ matrix.name }}
pip_deps: ${{ matrix.pip_deps }}
pytest-command: ${{ matrix.pytest_command }}
pytest-markers: ${{ matrix.markers }}
python-version: 3.9
gpu_num: 2
secrets:
mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }}
pytest_command: "coverage run -m pytest"
ci_repo_gpu_test_ref: v0.1.0
steps:
- name: Run PR GPU Tests
uses: mosaicml/ci-testing/.github/actions/[email protected]
with:
container: ${{ matrix.container }}
git_repo: mosaicml/llm-foundry
mcloud_timeout: 1800
name: ${{ matrix.name }}
pip_deps: ${{ matrix.pip_deps }}
pytest_command: ${{ matrix.pytest_command }}
pytest_markers: ${{ matrix.markers }}
python_version: 3.9
gpu_num: 2
mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }}
ci_repo_gpu_test_ref: ${{ matrix.ci_repo_gpu_test_ref }}
pytest-gpu-4:
uses: mosaicml/ci-testing/.github/workflows/[email protected]
name: ${{ matrix.name }}
runs-on: linux-ubuntu-latest
strategy:
fail-fast: false
matrix:
include:
- name: "gpu-2.3.1-4"
container: mosaicml/llm-foundry:2.3.1_cu121-latest
markers: "gpu"
pytest_command: "coverage run -m pytest"
pip_deps: "[all]"
name: ${{ matrix.name }}
if: github.repository_owner == 'mosaicml'
with:
container: ${{ matrix.container }}
git_repo: mosaicml/llm-foundry
mcloud-timeout: 1800
name: ${{ matrix.name }}
pip_deps: ${{ matrix.pip_deps }}
pytest-command: ${{ matrix.pytest_command }}
pytest-markers: ${{ matrix.markers }}
python-version: 3.9
gpu_num: 4
secrets:
mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }}
pytest_command: "coverage run -m pytest"
ci_repo_gpu_test_ref: v0.1.0
steps:
- name: Run PR GPU Tests
uses: mosaicml/ci-testing/.github/actions/[email protected]
with:
container: ${{ matrix.container }}
git_repo: mosaicml/llm-foundry
mcloud_timeout: 1800
name: ${{ matrix.name }}
pip_deps: ${{ matrix.pip_deps }}
pytest_command: ${{ matrix.pytest_command }}
pytest_markers: ${{ matrix.markers }}
python_version: 3.9
gpu_num: 4
mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }}
ci_repo_gpu_test_ref: ${{ matrix.ci_repo_gpu_test_ref }}
11 changes: 0 additions & 11 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -77,17 +77,6 @@ repos:
hooks:
- id: docformatter
args: [--in-place, --wrap-summaries=80, --wrap-descriptions=80]
- repo: https://github.com/PyCQA/pydocstyle
hooks:
- id: pydocstyle
name: pydocstyle
entry: pydocstyle
language: python
types: [python]
exclude: (.ci|.github)
additional_dependencies:
- toml
rev: 6.1.1
- repo: https://github.com/adrienverge/yamllint.git
rev: v1.28.0
hooks:
Expand Down
2 changes: 1 addition & 1 deletion llmfoundry/command_utils/data_prep/convert_dataset_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def build_hf_dataset(
"""Build an IterableDataset over the HF C4 or pile source data.
Args:
dataset_name (str): Dataset name
path (str): Dataset name
split (str): Split name.
mode (ConcatMode): NO_CONCAT, or CONCAT_TOKENS
max_length (int): The length of concatenated tokens
Expand Down
26 changes: 15 additions & 11 deletions llmfoundry/command_utils/data_prep/convert_delta_to_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,15 +78,16 @@ def to_cf(self: 'SparkConnectClient',
return the schema and drops all other responses.
Args:
plan (pb2.Plan): The plan object to be executed by spark.
type (str): The output format of the result, supported formats are 'json', 'csv', and 'arrow'.
self (SparkConnectClient): The SparkConnectClient we are processing.
plan (pb2.Plan): The plan object to be executed by spark.
type (str): The output format of the result, supported formats are 'json', 'csv', and 'arrow'.
Returns:
Tuple[List[Result], int, bool]: A tuple containing:
- A list of Result namedtuples, each containing a URL, row count, compressed size,
and uncompressed size of the part of the result.
- Total row count of all parts of the result.
- A boolean indicating whether the result has been truncated.
Tuple[List[Result], int, bool]: A tuple containing:
- A list of Result namedtuples, each containing a URL, row count, compressed size,
and uncompressed size of the part of the result.
- Total row count of all parts of the result.
- A boolean indicating whether the result has been truncated.
"""
req = self._execute_plan_request_with_metadata()
req.plan.CopyFrom(plan)
Expand Down Expand Up @@ -120,8 +121,9 @@ def to_cf(self: 'SparkConnectClient',
)

# Create the iterator
from pyspark.sql.connect.client.reattach import \
ExecutePlanResponseReattachableIterator
from pyspark.sql.connect.client.reattach import (
ExecutePlanResponseReattachableIterator,
)
iterator = ExecutePlanResponseReattachableIterator(
req,
self._stub,
Expand Down Expand Up @@ -169,6 +171,7 @@ def collect_as_cf(self: 'DataFrame',
uses the `to_cf` method to execute the plan and fetch results as presigned URLs.
Args:
self (DataFrame): The Spark dataframe we are processing.
type (str): The output format of the result, supported formats are 'json', 'csv', and 'arrow'.
Returns:
Expand Down Expand Up @@ -693,8 +696,9 @@ def _check_imports():
import pyspark.sql.connect.proto.cloud_pb2 as cloud_pb2
from pyspark.sql import SparkSession
from pyspark.sql.connect.client.core import SparkConnectClient
from pyspark.sql.connect.client.reattach import \
ExecutePlanResponseReattachableIterator
from pyspark.sql.connect.client.reattach import (
ExecutePlanResponseReattachableIterator,
)
from pyspark.sql.connect.dataframe import DataFrame
from pyspark.sql.dataframe import DataFrame as SparkDataFrame
from pyspark.sql.types import Row
Expand Down
86 changes: 74 additions & 12 deletions llmfoundry/data/finetuning/dataloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,12 @@ def build_finetuning_dataloader(
on which you intend to use, as explained below.
Args:
name (str): The type of dataloader to build. Must = "finetuning".
---
*** HuggingFace dataset config fields ***
tokenizer (transformers.PreTrainedTokenizer): The tokenizer used to
prepare the data from raw text. Any missing sentinel tokens will
be added by the collator.
device_batch_size (int, float): The size of the batches (number of examples)
that the dataloader will produce.
dataset (Dict[str, Any]): A HuggingFace dataset config which contains the following fields:
dataset.hf_name (str, optional): The name of the HuggingFace dataset
to use. Can also be a remote http(s) directory or object store bucket
containing the file {split}.jsonl in the format (prompt, response),
Expand Down Expand Up @@ -130,16 +133,32 @@ def build_finetuning_dataloader(
The script `scripts/misc/profile_packing.py` can help
you choose the best packing_ratio.
dataset.shuffle (bool): Whether to shuffle the dataset.
___
See :class:`StreamingFinetuningDataset` for info on other standard config
options within `dataset` that will be passed as kwargs if
using the streaming codepath.
---
tokenizer (transformers.PreTrainedTokenizer): The tokenizer used to
prepare the data from raw text. Any missing sentinel tokens will
be added by the collator.
device_batch_size (int, float): The size of the batches (number of examples)
that the dataloader will produce.
num_workers (int, optional): How many subprocesses to use for data loading.
0 means that the data will be loaded in the main process. The default is 0.
This argument is passed directly to the pytorch :class:`DataLoader`.
drop_last (bool, optional): If true, drop the last incomplete batch, if the dataset
size is not divisible by the batch size. If False and the size of dataset is
not divisible by the batch size, then the last batch will be smaller. The
default is False. This argument is passed directly to the pytorch :class:`DataLoader`.
pin_memory (bool, optional): If True, the data loader will copy Tensors into device/CUDA
pinned memory before returning them. If your data elements are a custom type, or your
`collate_fn` returns a batch that is a custom type, see the memory pinning notes in the
pytorch :class:`DataLoader` documentation. This argument is passed directly to
the pytorch :class:`DataLoader`.
prefetch_factor (int, optional): Number of batches loaded in advance by each worker.
2 means there will be a total of 2 * num_workers batches prefetched across all workers.
(default value depends on the set value for num_workers. If value of num_workers=0 default
is None. Otherwise, if value of num_workers > 0 default is 2). This argument is passed
directly to the pytorch :class:`DataLoader`.
persistent_workers (bool, optional): If True, the data loader will not shut down the worker
processes after a dataset has been consumed once. This keeps the workers' Dataset
instances alive between passes. The default is False. This argument is passed directly to the
pytorch :class:`DataLoader`.
timeout (int, optional): If positive, the timeout value for collecting a batch from workers.
Should always be non-negative. The default is 0. This argument is passed directly to the
pytorch :class:`DataLoader`.
See :class:`DataLoader` for standard argument options to the pytorch
dataloader, such as `drop_last`, `num_workers`, etc.
Expand Down Expand Up @@ -357,7 +376,50 @@ def _validate_config(
the other.
Args:
dataset_cfg (DictConfig): The dataset configuration to be validated.
max_seq_len (int): The maximum length of sequences
in the batch. See :class:`Seq2SeqFinetuningCollator` docstring
for details.
decoder_only_format (bool): Whether to format the
examples for a decoder-only model. See :class:`Seq2SeqFinetuningCollator`
docstring for details.
hf_name (str, optional): The name of the HuggingFace dataset
to use. Can also be a remote http(s) directory or object store bucket
containing the file {split}.jsonl in the format (prompt, response),
in which case the builder will create a HuggingFace dataset.
local (str, optional): Local path where remote data
will be streamed to. Only valid if `cfg.dataset.remote` has
also been set.
remote (str, optional): Location of a MDS-formatted
streaming dataset to use. Setting this will tell the builder
to create a streaming dataset rather than a HuggingFace dataset.
hf_kwargs (DictConfig, optional): Additional kwargs to
pass to `datasets.load_dataset`, which can be used to load
a dataset from local files.
preprocessing_fn (str, optional): The name/import path of
the preprocessing function to use for formatting the data examples.
If ``None`` (default), the builder will use the preprocessing function
registered under `hf_name` (see `tasks.py`), if one exists,
otherwise it will skip preprocessing.
If `preprocessing_fn` corresponds to a registered preprocessing
function in `tasks.py`, the builder will use that.
Otherwise, it will interpret `preprocessing_fn` as a
"import.path:function_name" import path; e.g., it will call
`from import.path import function_name` and use the imported
function as the preprocessing function.
safe_load (bool, optional): Whether to enforce safe loading of the dataset.
If `None`, will default to not applying any safe loading.
streams (Dict[str, Any], optional): A dictionary with multiple data streams.
If `None`, will assume no streams.
target_prompts (str): Which prompts are used as training targets.
Defaults to "none", meaning prompts are never used as training targets.
See :class:`Seq2SeqFinetuningCollator` docstring for details.
target_responses (str): Which responses are used as training targets.
Defaults to "last", meaning only the final response in multi-turn examples
will serve as training targets. See :class:`Seq2SeqFinetuningCollator` docstring for
details.
kwargs (DictConfig, optional): Additional kwargs to
pass to `datasets.load_dataset`, which can be used to load
a dataset from local files.
Raises:
ValueError: If the dataset configuration does not meet the requirements.
Expand Down Expand Up @@ -504,7 +566,7 @@ def _download_remote_hf_dataset(remote_path: str, split: str) -> str:
completed, the function removes the signal file.
Args:
hf_name (str): The path of the HuggingFace dataset to download.
remote_path (str): The path of the HuggingFace dataset to download.
split (str): The dataset split to download (e.g., 'train', 'validation', 'test').
Returns:
Expand Down
Loading

0 comments on commit b244deb

Please sign in to comment.