Skip to content

Commit

Permalink
Merge branch 'main' into jchang/hf_checkpointer_resize_embedding
Browse files Browse the repository at this point in the history
  • Loading branch information
jdchang1 authored Aug 3, 2024
2 parents 43c965c + 38dcf1e commit 3ffdb9e
Show file tree
Hide file tree
Showing 79 changed files with 4,957 additions and 3,023 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/code-quality.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ defaults:
working-directory: .
jobs:
code-quality:
runs-on: ubuntu-20.04
runs-on: linux-ubuntu-latest
timeout-minutes: 30
strategy:
matrix:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/coverage.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ on:
jobs:
coverage:
timeout-minutes: 5
runs-on: ubuntu-latest
runs-on: linux-ubuntu-latest
steps:
- name: Checkout Repo
uses: actions/checkout@v3
Expand Down
22 changes: 13 additions & 9 deletions .github/workflows/docker.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,6 @@ jobs:
base_image: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws
dep_groups: "[gpu]"
steps:
- name: Maximize Build Space on Worker
uses: easimon/maximize-build-space@v4
with:
overprovision-lvm: true
remove-dotnet: true
remove-android: true
remove-haskell: true

- name: Checkout
uses: actions/checkout@v3
Expand All @@ -47,6 +40,13 @@ jobs:
username: ${{ secrets.DOCKER_HUB_USERNAME }}
password: ${{ secrets.DOCKER_HUB_PASSWORD }}

- name: Login to GHCR
uses: docker/login-action@v2
with:
username: ${{ secrets.GHCR_USERNAME }}
password: ${{ secrets.GHCR_TOKEN }}
registry: ghcr.io

- name: Calculate Docker Image Variables
run: |
set -euxo pipefail
Expand All @@ -60,13 +60,17 @@ jobs:
if [ "${{ github.event_name }}" == "pull_request" ]; then
echo "Triggered by pull_request event."
STAGING_REPO="mosaicml/ci-staging"
IMAGE_TAG="${STAGING_REPO}:${{matrix.name}}-${GIT_SHA}"
GHCR_STAGING_REPO="ghcr.io/databricks-mosaic/ci-staging"
GHCR_IMAGE_TAG="${GHCR_STAGING_REPO}:${{matrix.name}}-${GIT_SHA}"
IMAGE_TAG="${STAGING_REPO}:${{matrix.name}}-${GIT_SHA},${GHCR_IMAGE_TAG}"
IMAGE_CACHE="${STAGING_REPO}:${{matrix.name}}-buildcache"
else
# Triggered by push or workflow_dispatch event
echo "Triggered by ${{ github.event_name }} event, releasing to prod"
PROD_REPO="mosaicml/llm-foundry"
IMAGE_TAG="${PROD_REPO}:${{matrix.name}}-${GIT_SHA},${PROD_REPO}:${{matrix.name}}-latest"
GHCR_PROD_REPO="ghcr.io/databricks-mosaic/llm-foundry"
GHCR_IMAGE_TAG="${GHCR_PROD_REPO}:${{matrix.name}}-${GIT_SHA},${GHCR_PROD_REPO}:${{matrix.name}}-latest"
IMAGE_TAG="${PROD_REPO}:${{matrix.name}}-${GIT_SHA},${PROD_REPO}:${{matrix.name}}-latest,${GHCR_IMAGE_TAG}"
IMAGE_CACHE="${PROD_REPO}:${{matrix.name}}-buildcache"
fi
Expand Down
25 changes: 15 additions & 10 deletions .github/workflows/pr-cpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,28 @@ concurrency:
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
jobs:
pytest-cpu:
uses: mosaicml/ci-testing/.github/workflows/[email protected]
name: ${{ matrix.name }}
runs-on: ubuntu-latest
strategy:
matrix:
include:
- name: "cpu-2.3.1"
pip_deps: "[all-cpu]"
container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
markers: "not gpu"
pytest_command: "coverage run -m pytest"
name: ${{ matrix.name }}
if: github.repository_owner == 'mosaicml'
with:
container: ${{ matrix.container }}
name: ${{ matrix.name }}
pip_deps: "[all-cpu]"
pytest-command: ${{ matrix.pytest_command }}
pytest-markers: ${{ matrix.markers }}
safe_directory: llm-foundry
steps:
- name: Checkout code
uses: actions/checkout@v2
- name: Run PR CPU Tests
uses: mosaicml/ci-testing/.github/actions/[email protected]
with:
name: ${{ matrix.name }}
container: ${{ matrix.container }}
pip_deps: ${{ matrix.pip_deps }}
pytest_command: ${{ matrix.pytest_command }}
pytest_markers: ${{ matrix.markers }}
safe_directory: llm-foundry
coverage:
uses: ./.github/workflows/coverage.yaml
name: Coverage Results
Expand Down
105 changes: 59 additions & 46 deletions .github/workflows/pr-gpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,82 +9,95 @@ on:
- main
- release/**
workflow_dispatch:
# Cancel old runs when a new commit is pushed to the same branch, unless the branch is main
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
jobs:
pytest-gpu-1:
uses: mosaicml/ci-testing/.github/workflows/[email protected]
name: ${{ matrix.name }}
if: github.repository_owner == 'mosaicml'
runs-on: linux-ubuntu-latest
strategy:
fail-fast: false
matrix:
include:
- name: "gpu-2.3.1-1"
container: mosaicml/llm-foundry:2.3.1_cu121-latest
markers: "gpu"
pytest_command: "coverage run -m pytest"
pip_deps: "[all]"
pytest_command: "coverage run -m pytest"
ci_repo_gpu_test_ref: v0.1.0
steps:
- name: Run PR GPU Tests
uses: mosaicml/ci-testing/.github/actions/[email protected]
with:
container: ${{ matrix.container }}
git_repo: mosaicml/llm-foundry
mcloud_timeout: 1800
name: ${{ matrix.name }}
pip_deps: ${{ matrix.pip_deps }}
pytest_command: ${{ matrix.pytest_command }}
pytest_markers: ${{ matrix.markers }}
python_version: 3.9
gpu_num: 1
mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }}
ci_repo_gpu_test_ref: ${{ matrix.ci_repo_gpu_test_ref }}
pytest-gpu-2:
name: ${{ matrix.name }}
if: github.repository_owner == 'mosaicml'
with:
container: ${{ matrix.container }}
git_repo: mosaicml/llm-foundry
mcloud-timeout: 1800
name: ${{ matrix.name }}
pip_deps: ${{ matrix.pip_deps }}
pytest-command: ${{ matrix.pytest_command }}
pytest-markers: ${{ matrix.markers }}
python-version: 3.9
gpu_num: 1
secrets:
mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }}
pytest-gpu-2:
uses: mosaicml/ci-testing/.github/workflows/[email protected]
runs-on: linux-ubuntu-latest
strategy:
fail-fast: false
matrix:
include:
- name: "gpu-2.3.1-2"
container: mosaicml/llm-foundry:2.3.1_cu121-latest
markers: "gpu"
pytest_command: "coverage run -m pytest"
pip_deps: "[all]"
pytest_command: "coverage run -m pytest"
ci_repo_gpu_test_ref: v0.1.0
steps:
- name: Run PR GPU Tests
uses: mosaicml/ci-testing/.github/actions/[email protected]
with:
container: ${{ matrix.container }}
git_repo: mosaicml/llm-foundry
mcloud_timeout: 1800
name: ${{ matrix.name }}
pip_deps: ${{ matrix.pip_deps }}
pytest_command: ${{ matrix.pytest_command }}
pytest_markers: ${{ matrix.markers }}
python_version: 3.9
gpu_num: 2
mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }}
ci_repo_gpu_test_ref: ${{ matrix.ci_repo_gpu_test_ref }}
pytest-gpu-4:
name: ${{ matrix.name }}
if: github.repository_owner == 'mosaicml'
with:
container: ${{ matrix.container }}
git_repo: mosaicml/llm-foundry
mcloud-timeout: 1800
name: ${{ matrix.name }}
pip_deps: ${{ matrix.pip_deps }}
pytest-command: ${{ matrix.pytest_command }}
pytest-markers: ${{ matrix.markers }}
python-version: 3.9
gpu_num: 2
secrets:
mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }}
pytest-gpu-4:
uses: mosaicml/ci-testing/.github/workflows/[email protected]
runs-on: linux-ubuntu-latest
strategy:
fail-fast: false
matrix:
include:
- name: "gpu-2.3.1-4"
container: mosaicml/llm-foundry:2.3.1_cu121-latest
markers: "gpu"
pytest_command: "coverage run -m pytest"
pip_deps: "[all]"
name: ${{ matrix.name }}
if: github.repository_owner == 'mosaicml'
with:
container: ${{ matrix.container }}
git_repo: mosaicml/llm-foundry
mcloud-timeout: 1800
name: ${{ matrix.name }}
pip_deps: ${{ matrix.pip_deps }}
pytest-command: ${{ matrix.pytest_command }}
pytest-markers: ${{ matrix.markers }}
python-version: 3.9
gpu_num: 4
secrets:
mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }}
pytest_command: "coverage run -m pytest"
ci_repo_gpu_test_ref: v0.1.0
steps:
- name: Run PR GPU Tests
uses: mosaicml/ci-testing/.github/actions/[email protected]
with:
container: ${{ matrix.container }}
git_repo: mosaicml/llm-foundry
mcloud_timeout: 1800
name: ${{ matrix.name }}
pip_deps: ${{ matrix.pip_deps }}
pytest_command: ${{ matrix.pytest_command }}
pytest_markers: ${{ matrix.markers }}
python_version: 3.9
gpu_num: 4
mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }}
ci_repo_gpu_test_ref: ${{ matrix.ci_repo_gpu_test_ref }}
2 changes: 1 addition & 1 deletion .github/workflows/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ jobs:
name: Build and Publish llm-foundry PyPI Package
needs:
- code-quality
runs-on: ubuntu-latest
runs-on: linux-ubuntu-latest
steps:
- name: Checkout source
uses: actions/checkout@v3
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/smoketest.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ defaults:
working-directory: .
jobs:
smoketest:
runs-on: ubuntu-20.04
runs-on: linux-ubuntu-latest
timeout-minutes: 20
strategy:
matrix:
Expand Down
11 changes: 0 additions & 11 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -77,17 +77,6 @@ repos:
hooks:
- id: docformatter
args: [--in-place, --wrap-summaries=80, --wrap-descriptions=80]
- repo: https://github.com/PyCQA/pydocstyle
hooks:
- id: pydocstyle
name: pydocstyle
entry: pydocstyle
language: python
types: [python]
exclude: (.ci|.github)
additional_dependencies:
- toml
rev: 6.1.1
- repo: https://github.com/adrienverge/yamllint.git
rev: v1.28.0
hooks:
Expand Down
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ DBRX is a state-of-the-art open source LLM trained by Databricks Mosaic team. It
| DBRX Base | 32768 | https://huggingface.co/databricks/dbrx-base |
| DBRX Instruct | 32768 | https://huggingface.co/databricks/dbrx-instruct |

Our model weights and code are licensed for both researchers and commercial entities. The Databricks Open Source License can be found at [LICENSE](https://github.com/databricks/dbrx/LICENSE), and our Acceptable Use Policy can be found [here](https://www.databricks.com/legal/acceptable-use-policy-open-model).
Our model weights and code are licensed for both researchers and commercial entities. The Databricks Open Source License can be found at [LICENSE](https://github.com/databricks/dbrx/blob/main/LICENSE), and our Acceptable Use Policy can be found [here](https://www.databricks.com/legal/acceptable-use-policy-open-model).

For more information about the DBRX models, see https://github.com/databricks/dbrx.

Expand Down Expand Up @@ -309,10 +309,15 @@ dependencies = [
"llm-foundry",
]

# Note: Even though in python code, this would be llmfoundry.registry.loggers,
# when specified in the entry_points, it has to be "llmfoundry_loggers". That is,
# the segments of the name should be joined by an _ in the entry_points section.
[project.entry-points."llmfoundry_loggers"]
my_logger = "foundry_registry.loggers:MyLogger"
```

If developing new components via entrypoints, it is important to note that Python entrypoints are global to the Python environment. This means that if you have multiple packages that register components with the same key, the last one installed will be the one used. This can be useful for overriding components in LLM Foundry, but it can also lead to unexpected behavior if you are not careful. Additionally, if you change the pyproject.toml, you will need to reinstall the package for the changes to take effect. You can do this quickly by installing with `pip install -e . --no-deps` to avoid reinstalling dependencies.

### Direct call to register

You can also register a component directly in your code:
Expand Down
4 changes: 2 additions & 2 deletions llmfoundry/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
tokenizers,
utils,
)
from llmfoundry._version import __version__
from llmfoundry.data import StreamingFinetuningDataset, StreamingTextDataset
from llmfoundry.eval import InContextLearningDataset, InContextLearningMetric
from llmfoundry.models.hf import ComposerHFCausalLM
Expand All @@ -63,6 +64,7 @@
from llmfoundry.optim import DecoupledLionW

__all__ = [
'__version__',
'StreamingFinetuningDataset',
'StreamingTextDataset',
'InContextLearningDataset',
Expand All @@ -87,5 +89,3 @@
'tokenizers',
'utils',
]

__version__ = '0.11.0.dev0'
6 changes: 6 additions & 0 deletions llmfoundry/_version.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Copyright 2024 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0

"""The LLM Foundry Version."""

__version__ = '0.11.0.dev'
3 changes: 2 additions & 1 deletion llmfoundry/callbacks/async_eval_callback.py
Original file line number Diff line number Diff line change
Expand Up @@ -557,7 +557,8 @@ def launch_run(self, checkpoint: str, current_interval: Time) -> Run:
installation_path = i['path']

if not found_llm_foundry:
from llmfoundry import __version__ as latest_foundry_version
from llmfoundry._version import \
__version__ as latest_foundry_version

# If github integration is not found, foundry is likely installed
# through the run command. In this case, we'll add the integration
Expand Down
28 changes: 8 additions & 20 deletions llmfoundry/callbacks/curriculum_learning_callback.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,18 +128,17 @@ def after_load(self, state: State, logger: Logger):
self._validate_dataloader(state.train_dataloader)

# If checkpoint was saved before iteration was incremented, we need to increment it now
duration = self._schedule[self._schedule_index]['duration']
if ((
self._schedule[self._schedule_index]['duration'].unit
== TimeUnit.TOKEN and state.timestamp.token_in_iteration >=
self._schedule[self._schedule_index]['duration'].value
duration.unit == TimeUnit.TOKEN and
state.timestamp.token_in_iteration >= duration.value
) or (
self._schedule[self._schedule_index]['duration'].unit
== TimeUnit.EPOCH and state.timestamp.epoch_in_iteration >=
self._schedule[self._schedule_index]['duration'].value
duration.unit == TimeUnit.EPOCH and
state.timestamp.epoch_in_iteration >= duration.value
)):
log.warning((
'The CurriculumLearning callback has detected that the previous run did not correctly '
'increment the iteration.'
'The CurriculumLearning callback has detected that the '
'previous run did not correctly increment the iteration.'
))
self._schedule_index += 1
state.timestamp = state.timestamp.to_next_iteration()
Expand Down Expand Up @@ -199,24 +198,13 @@ def load_state_dict(self, state: dict[str, Any]):
f'Expected {saved_loader} but got {current_loader}',
))

# Ensure that the current datamix duration is greater than timestamp
# Ensure that the current datamix duration is in the correct units
duration = self._schedule[self._schedule_index]['duration']
if duration.unit != TimeUnit.TOKEN and duration.unit != TimeUnit.EPOCH:
raise ValueError((
f'Duration must be in terms of tokens or epochs, but got ',
f'{duration.unit}.',
))
if ((
duration.unit == TimeUnit.TOKEN and
duration > state['timestamp'].token_in_iteration
) or (
duration.unit == TimeUnit.EPOCH and
duration > state['timestamp'].epoch_in_iteration
)):
raise ValueError((
'The duration of the current datamix must be less or equal to '
'than the saved timestamp.'
))

def _build_train_loader(
self,
Expand Down
Loading

0 comments on commit 3ffdb9e

Please sign in to comment.