Merge branch 'main' into jchang/hf_checkpointer_resize_embedding

mosaicml · Aug 3, 2024 · 3ffdb9e · 3ffdb9e
2 parents 43c965c + 38dcf1e
commit 3ffdb9e
Show file tree

Hide file tree

Showing 79 changed files with 4,957 additions and 3,023 deletions.
diff --git a/.github/workflows/code-quality.yaml b/.github/workflows/code-quality.yaml
@@ -19,7 +19,7 @@ defaults:
     working-directory: .
 jobs:
   code-quality:
-    runs-on: ubuntu-20.04
+    runs-on: linux-ubuntu-latest
     timeout-minutes: 30
     strategy:
       matrix:

diff --git a/.github/workflows/coverage.yaml b/.github/workflows/coverage.yaml
@@ -8,7 +8,7 @@ on:
 jobs:
   coverage:
     timeout-minutes: 5
-    runs-on: ubuntu-latest
+    runs-on: linux-ubuntu-latest
     steps:
     - name: Checkout Repo
       uses: actions/checkout@v3

diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml
@@ -24,13 +24,6 @@ jobs:
           base_image: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws
           dep_groups: "[gpu]"
     steps:
-    - name: Maximize Build Space on Worker
-      uses: easimon/maximize-build-space@v4
-      with:
-        overprovision-lvm: true
-        remove-dotnet: true
-        remove-android: true
-        remove-haskell: true
 
     - name: Checkout
       uses: actions/checkout@v3
@@ -47,6 +40,13 @@ jobs:
         username: ${{ secrets.DOCKER_HUB_USERNAME }}
         password: ${{ secrets.DOCKER_HUB_PASSWORD }}
 
+    - name: Login to GHCR
+      uses: docker/login-action@v2
+      with:
+        username: ${{ secrets.GHCR_USERNAME }}
+        password: ${{ secrets.GHCR_TOKEN }}
+        registry: ghcr.io
+
     - name: Calculate Docker Image Variables
       run: |
         set -euxo pipefail
@@ -60,13 +60,17 @@ jobs:
         if [ "${{ github.event_name }}" == "pull_request" ]; then
           echo "Triggered by pull_request event."
           STAGING_REPO="mosaicml/ci-staging"
-          IMAGE_TAG="${STAGING_REPO}:${{matrix.name}}-${GIT_SHA}"
+          GHCR_STAGING_REPO="ghcr.io/databricks-mosaic/ci-staging"
+          GHCR_IMAGE_TAG="${GHCR_STAGING_REPO}:${{matrix.name}}-${GIT_SHA}"
+          IMAGE_TAG="${STAGING_REPO}:${{matrix.name}}-${GIT_SHA},${GHCR_IMAGE_TAG}"
           IMAGE_CACHE="${STAGING_REPO}:${{matrix.name}}-buildcache"
         else
           # Triggered by push or workflow_dispatch event
           echo "Triggered by ${{ github.event_name }} event, releasing to prod"
           PROD_REPO="mosaicml/llm-foundry"
-          IMAGE_TAG="${PROD_REPO}:${{matrix.name}}-${GIT_SHA},${PROD_REPO}:${{matrix.name}}-latest"
+          GHCR_PROD_REPO="ghcr.io/databricks-mosaic/llm-foundry"
+          GHCR_IMAGE_TAG="${GHCR_PROD_REPO}:${{matrix.name}}-${GIT_SHA},${GHCR_PROD_REPO}:${{matrix.name}}-latest"
+          IMAGE_TAG="${PROD_REPO}:${{matrix.name}}-${GIT_SHA},${PROD_REPO}:${{matrix.name}}-latest,${GHCR_IMAGE_TAG}"
           IMAGE_CACHE="${PROD_REPO}:${{matrix.name}}-buildcache"
         fi
 

diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml
@@ -15,23 +15,28 @@ concurrency:
   cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
 jobs:
   pytest-cpu:
-    uses: mosaicml/ci-testing/.github/workflows/[email protected]
+    name: ${{ matrix.name }}
+    runs-on: ubuntu-latest
     strategy:
       matrix:
         include:
         - name: "cpu-2.3.1"
+          pip_deps: "[all-cpu]"
           container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
           markers: "not gpu"
           pytest_command: "coverage run -m pytest"
-    name: ${{ matrix.name }}
-    if: github.repository_owner == 'mosaicml'
-    with:
-      container: ${{ matrix.container }}
-      name: ${{ matrix.name }}
-      pip_deps: "[all-cpu]"
-      pytest-command: ${{ matrix.pytest_command }}
-      pytest-markers: ${{ matrix.markers }}
-      safe_directory: llm-foundry
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v2
+    - name: Run PR CPU Tests
+      uses: mosaicml/ci-testing/.github/actions/[email protected]
+      with:
+        name: ${{ matrix.name }}
+        container: ${{ matrix.container }}
+        pip_deps: ${{ matrix.pip_deps }}
+        pytest_command: ${{ matrix.pytest_command }}
+        pytest_markers: ${{ matrix.markers }}
+        safe_directory: llm-foundry
   coverage:
     uses: ./.github/workflows/coverage.yaml
     name: Coverage Results

diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml
@@ -9,82 +9,95 @@ on:
     - main
     - release/**
   workflow_dispatch:
+# Cancel in-progress runs when a new commit is pushed to the same PR or ref, unless the ref is main
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
   cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
 jobs:
   pytest-gpu-1:
-    uses: mosaicml/ci-testing/.github/workflows/[email protected]
+    name: ${{ matrix.name }}
+    if: github.repository_owner == 'mosaicml'
+    runs-on: linux-ubuntu-latest
     strategy:
       fail-fast: false
       matrix:
         include:
         - name: "gpu-2.3.1-1"
           container: mosaicml/llm-foundry:2.3.1_cu121-latest
           markers: "gpu"
-          pytest_command: "coverage run -m pytest"
           pip_deps: "[all]"
+          pytest_command: "coverage run -m pytest"
+          ci_repo_gpu_test_ref: v0.1.0
+    steps:
+    - name: Run PR GPU Tests
+      uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.0
+      with:
+        container: ${{ matrix.container }}
+        git_repo: mosaicml/llm-foundry
+        mcloud_timeout: 1800
+        name: ${{ matrix.name }}
+        pip_deps: ${{ matrix.pip_deps }}
+        pytest_command: ${{ matrix.pytest_command }}
+        pytest_markers: ${{ matrix.markers }}
+        python_version: 3.9
+        gpu_num: 1
+        mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }}
+        ci_repo_gpu_test_ref: ${{ matrix.ci_repo_gpu_test_ref }}
+  pytest-gpu-2:
     name: ${{ matrix.name }}
     if: github.repository_owner == 'mosaicml'
-    with:
-      container: ${{ matrix.container }}
-      git_repo: mosaicml/llm-foundry
-      mcloud-timeout: 1800
-      name: ${{ matrix.name }}
-      pip_deps: ${{ matrix.pip_deps }}
-      pytest-command: ${{ matrix.pytest_command }}
-      pytest-markers: ${{ matrix.markers }}
-      python-version: 3.9
-      gpu_num: 1
-    secrets:
-      mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }}
-  pytest-gpu-2:
-    uses: mosaicml/ci-testing/.github/workflows/[email protected]
+    runs-on: linux-ubuntu-latest
     strategy:
       fail-fast: false
       matrix:
         include:
         - name: "gpu-2.3.1-2"
           container: mosaicml/llm-foundry:2.3.1_cu121-latest
           markers: "gpu"
-          pytest_command: "coverage run -m pytest"
           pip_deps: "[all]"
+          pytest_command: "coverage run -m pytest"
+          ci_repo_gpu_test_ref: v0.1.0
+    steps:
+    - name: Run PR GPU Tests
+      uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.0
+      with:
+        container: ${{ matrix.container }}
+        git_repo: mosaicml/llm-foundry
+        mcloud_timeout: 1800
+        name: ${{ matrix.name }}
+        pip_deps: ${{ matrix.pip_deps }}
+        pytest_command: ${{ matrix.pytest_command }}
+        pytest_markers: ${{ matrix.markers }}
+        python_version: 3.9
+        gpu_num: 2
+        mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }}
+        ci_repo_gpu_test_ref: ${{ matrix.ci_repo_gpu_test_ref }}
+  pytest-gpu-4:
     name: ${{ matrix.name }}
     if: github.repository_owner == 'mosaicml'
-    with:
-      container: ${{ matrix.container }}
-      git_repo: mosaicml/llm-foundry
-      mcloud-timeout: 1800
-      name: ${{ matrix.name }}
-      pip_deps: ${{ matrix.pip_deps }}
-      pytest-command: ${{ matrix.pytest_command }}
-      pytest-markers: ${{ matrix.markers }}
-      python-version: 3.9
-      gpu_num: 2
-    secrets:
-      mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }}
-  pytest-gpu-4:
-    uses: mosaicml/ci-testing/.github/workflows/[email protected]
+    runs-on: linux-ubuntu-latest
     strategy:
       fail-fast: false
       matrix:
         include:
         - name: "gpu-2.3.1-4"
           container: mosaicml/llm-foundry:2.3.1_cu121-latest
           markers: "gpu"
-          pytest_command: "coverage run -m pytest"
           pip_deps: "[all]"
-    name: ${{ matrix.name }}
-    if: github.repository_owner == 'mosaicml'
-    with:
-      container: ${{ matrix.container }}
-      git_repo: mosaicml/llm-foundry
-      mcloud-timeout: 1800
-      name: ${{ matrix.name }}
-      pip_deps: ${{ matrix.pip_deps }}
-      pytest-command: ${{ matrix.pytest_command }}
-      pytest-markers: ${{ matrix.markers }}
-      python-version: 3.9
-      gpu_num: 4
-    secrets:
-      mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }}
+          pytest_command: "coverage run -m pytest"
+          ci_repo_gpu_test_ref: v0.1.0
+    steps:
+    - name: Run PR GPU Tests
+      uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.0
+      with:
+        container: ${{ matrix.container }}
+        git_repo: mosaicml/llm-foundry
+        mcloud_timeout: 1800
+        name: ${{ matrix.name }}
+        pip_deps: ${{ matrix.pip_deps }}
+        pytest_command: ${{ matrix.pytest_command }}
+        pytest_markers: ${{ matrix.markers }}
+        python_version: 3.9
+        gpu_num: 4
+        mcloud_api_key: ${{ secrets.MCLOUD_API_KEY }}
+        ci_repo_gpu_test_ref: ${{ matrix.ci_repo_gpu_test_ref }}
diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
@@ -14,7 +14,7 @@ jobs:
     name: Build and Publish llm-foundry PyPI Package
     needs:
     - code-quality
-    runs-on: ubuntu-latest
+    runs-on: linux-ubuntu-latest
     steps:
     - name: Checkout source
       uses: actions/checkout@v3

diff --git a/.github/workflows/smoketest.yaml b/.github/workflows/smoketest.yaml
@@ -18,7 +18,7 @@ defaults:
     working-directory: .
 jobs:
   smoketest:
-    runs-on: ubuntu-20.04
+    runs-on: linux-ubuntu-latest
     timeout-minutes: 20
     strategy:
       matrix:

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -77,17 +77,6 @@ repos:
   hooks:
   - id: docformatter
     args: [--in-place, --wrap-summaries=80, --wrap-descriptions=80]
-- repo: https://github.com/PyCQA/pydocstyle
-  hooks:
-  - id: pydocstyle
-    name: pydocstyle
-    entry: pydocstyle
-    language: python
-    types: [python]
-    exclude: (.ci|.github)
-    additional_dependencies:
-    - toml
-  rev: 6.1.1
 - repo: https://github.com/adrienverge/yamllint.git
   rev: v1.28.0
   hooks:

diff --git a/README.md b/README.md
@@ -50,7 +50,7 @@ DBRX is a state-of-the-art open source LLM trained by Databricks Mosaic team. It
 | DBRX Base          | 32768          | https://huggingface.co/databricks/dbrx-base        |
 | DBRX Instruct      | 32768          | https://huggingface.co/databricks/dbrx-instruct    |
 
-Our model weights and code are licensed for both researchers and commercial entities. The Databricks Open Source License can be found at [LICENSE](https://github.com/databricks/dbrx/LICENSE), and our Acceptable Use Policy can be found [here](https://www.databricks.com/legal/acceptable-use-policy-open-model).
+Our model weights and code are licensed for both researchers and commercial entities. The Databricks Open Source License can be found at [LICENSE](https://github.com/databricks/dbrx/blob/main/LICENSE), and our Acceptable Use Policy can be found [here](https://www.databricks.com/legal/acceptable-use-policy-open-model).
 
 For more information about the DBRX models, see https://github.com/databricks/dbrx.
 
@@ -309,10 +309,15 @@ dependencies = [
     "llm-foundry",
 ]
 
+# Note: Even though in python code, this would be llmfoundry.registry.loggers,
+# when specified in the entry_points, it has to be "llmfoundry_loggers". That is,
+# the segments of the name should be joined by an _ in the entry_points section.
 [project.entry-points."llmfoundry_loggers"]
 my_logger = "foundry_registry.loggers:MyLogger"
 ```
 
+If you are developing new components via entrypoints, it is important to note that Python entrypoints are global to the Python environment. This means that if you have multiple packages that register components with the same key, the last one installed will be the one used. This can be useful for overriding components in LLM Foundry, but it can also lead to unexpected behavior if you are not careful. Additionally, if you change the pyproject.toml, you will need to reinstall the package for the changes to take effect. You can do this quickly by installing with `pip install -e . --no-deps` to avoid reinstalling dependencies.
+
 ### Direct call to register
 
 You can also register a component directly in your code:

diff --git a/llmfoundry/__init__.py b/llmfoundry/__init__.py
@@ -50,6 +50,7 @@
     tokenizers,
     utils,
 )
+from llmfoundry._version import __version__
 from llmfoundry.data import StreamingFinetuningDataset, StreamingTextDataset
 from llmfoundry.eval import InContextLearningDataset, InContextLearningMetric
 from llmfoundry.models.hf import ComposerHFCausalLM
@@ -63,6 +64,7 @@
 from llmfoundry.optim import DecoupledLionW
 
 __all__ = [
+    '__version__',
     'StreamingFinetuningDataset',
     'StreamingTextDataset',
     'InContextLearningDataset',
@@ -87,5 +89,3 @@
     'tokenizers',
     'utils',
 ]
-
-__version__ = '0.11.0.dev0'
diff --git a/llmfoundry/_version.py b/llmfoundry/_version.py
@@ -0,0 +1,6 @@
+# Copyright 2024 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+
+"""The LLM Foundry Version."""
+
+__version__ = '0.11.0.dev'
diff --git a/llmfoundry/callbacks/async_eval_callback.py b/llmfoundry/callbacks/async_eval_callback.py
@@ -557,7 +557,8 @@ def launch_run(self, checkpoint: str, current_interval: Time) -> Run:
                 installation_path = i['path']
 
         if not found_llm_foundry:
-            from llmfoundry import __version__ as latest_foundry_version
+            from llmfoundry._version import \
+                __version__ as latest_foundry_version
 
             # If github integration is not found, foundry is likely installed
             # through the run command. In this case, we'll add the integration

diff --git a/llmfoundry/callbacks/curriculum_learning_callback.py b/llmfoundry/callbacks/curriculum_learning_callback.py
@@ -128,18 +128,17 @@ def after_load(self, state: State, logger: Logger):
         self._validate_dataloader(state.train_dataloader)
 
         # If checkpoint was saved before iteration was incremented, we need to increment it now
+        duration = self._schedule[self._schedule_index]['duration']
         if ((
-            self._schedule[self._schedule_index]['duration'].unit
-            == TimeUnit.TOKEN and state.timestamp.token_in_iteration >=
-            self._schedule[self._schedule_index]['duration'].value
+            duration.unit == TimeUnit.TOKEN and
+            state.timestamp.token_in_iteration >= duration.value
         ) or (
-            self._schedule[self._schedule_index]['duration'].unit
-            == TimeUnit.EPOCH and state.timestamp.epoch_in_iteration >=
-            self._schedule[self._schedule_index]['duration'].value
+            duration.unit == TimeUnit.EPOCH and
+            state.timestamp.epoch_in_iteration >= duration.value
         )):
             log.warning((
-                'The CurriculumLearning callback has detected that the previous run did not correctly '
-                'increment the iteration.'
+                'The CurriculumLearning callback has detected that the '
+                'previous run did not correctly increment the iteration.'
             ))
             self._schedule_index += 1
             state.timestamp = state.timestamp.to_next_iteration()
@@ -199,24 +198,13 @@ def load_state_dict(self, state: dict[str, Any]):
                 f'Expected {saved_loader} but got {current_loader}',
             ))
 
-        # Ensure that the current datamix duration is greater than timestamp
+        # Ensure that the current datamix duration is in the correct units
         duration = self._schedule[self._schedule_index]['duration']
         if duration.unit != TimeUnit.TOKEN and duration.unit != TimeUnit.EPOCH:
             raise ValueError((
                 f'Duration must be in terms of tokens or epochs, but got ',
                 f'{duration.unit}.',
             ))
-        if ((
-            duration.unit == TimeUnit.TOKEN and
-            duration > state['timestamp'].token_in_iteration
-        ) or (
-            duration.unit == TimeUnit.EPOCH and
-            duration > state['timestamp'].epoch_in_iteration
-        )):
-            raise ValueError((
-                'The duration of the current datamix must be less or equal to '
-                'than the saved timestamp.'
-            ))
 
     def _build_train_loader(
         self,