Merge branch 'main' into mlflow-log-model
dakinggg authored Oct 11, 2023
2 parents 6369b7b + cdb1c28 commit a36f93b
Showing 19 changed files with 1,173 additions and 35 deletions.
41 changes: 36 additions & 5 deletions .github/workflows/docker.yaml
@@ -3,6 +3,12 @@ on:
push:
branches:
- main
pull_request:
branches:
- main
paths:
- ./Dockerfile
- .github/workflows/docker.yaml
workflow_dispatch: {}
jobs:
docker-build:
@@ -13,10 +19,16 @@ jobs:
include:
- name: '1.13.1_cu117'
base_image: mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
dep_groups: '[gpu]'
- name: '2.0.1_cu118'
base_image: mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04
dep_groups: '[gpu]'
- name: '2.1.0_cu121'
base_image: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04
dep_groups: '[gpu]'
- name: '2.1.0_cu121_flash2'
base_image: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04
dep_groups: '[gpu-flash2]'

steps:
- name: Maximize Build Space on Worker
@@ -52,13 +64,32 @@ jobs:
GIT_SHA=$(echo ${{ github.sha }} | cut -c1-7)
echo "IMAGE_TAG=${GIT_SHA}" >> ${GITHUB_ENV}
if [ "${{ github.event_name }}" == "push" ]; then
echo "Triggered by push event."
PROD_REPO="mosaicml/llm-foundry"
IMAGE_TAG="${PROD_REPO}:${{matrix.name}}-${GIT_SHA},${PROD_REPO}:${{matrix.name}}-latest"
IMAGE_CACHE="${PROD_REPO}:${{matrix.name}}-buildcache"
elif [ "${{ github.event_name }}" == "pull_request" ]; then
echo "Triggered by pull_request event."
STAGING_REPO="mosaicml/ci-staging"
IMAGE_TAG="${STAGING_REPO}:${{matrix.name}}-${GIT_SHA}"
IMAGE_CACHE="${STAGING_REPO}:${{matrix.name}}-buildcache"
else
echo "Triggered by unknown event: ${{ github.event_name }}"
exit 1
fi
echo "IMAGE_TAG=${IMAGE_TAG}" >> ${GITHUB_ENV}
echo "IMAGE_CACHE=${IMAGE_CACHE}" >> ${GITHUB_ENV}
- name: Build and Push the Docker Image
uses: docker/build-push-action@v3
with:
context: .
tags: mosaicml/llm-foundry:${{ matrix.name }}-latest,
mosaicml/llm-foundry:${{ matrix.name }}-${{ env.IMAGE_TAG }}
tags: ${{ env.IMAGE_TAG }}
push: true
cache-from: type=registry,ref=mosaicml/llm-foundry:${{ matrix.name }}-buildcache
cache-to: type=registry,ref=mosaicml/llm-foundry:${{ matrix.name }}-buildcache,mode=max
build-args: BASE_IMAGE=${{ matrix.base_image }}
cache-from: type=registry,ref=${{ env.IMAGE_CACHE }}
cache-to: type=registry,ref=${{ env.IMAGE_CACHE }},mode=max
build-args: |
BASE_IMAGE=${{ matrix.base_image }}
DEP_GROUPS=${{ matrix.dep_groups }}
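
For reference, the image-tag and build-cache selection introduced above can be summarized with the sketch below. It is illustrative only and not part of the workflow; the repository names and event handling mirror the shell step, and the helper name is invented for this note.

```python
# Illustrative sketch of the IMAGE_TAG / IMAGE_CACHE selection in the workflow step above.
def image_refs(event_name: str, matrix_name: str, git_sha: str) -> tuple:
    short_sha = git_sha[:7]  # mirrors `cut -c1-7`
    if event_name == 'push':
        repo = 'mosaicml/llm-foundry'  # production repo; also tagged "-latest"
        tags = f'{repo}:{matrix_name}-{short_sha},{repo}:{matrix_name}-latest'
    elif event_name == 'pull_request':
        repo = 'mosaicml/ci-staging'  # staging repo for PR builds
        tags = f'{repo}:{matrix_name}-{short_sha}'
    else:
        raise ValueError(f'Triggered by unknown event: {event_name}')
    return tags, f'{repo}:{matrix_name}-buildcache'

# image_refs('pull_request', '2.1.0_cu121_flash2', '<full git sha>') ->
#   ('mosaicml/ci-staging:2.1.0_cu121_flash2-<short sha>',
#    'mosaicml/ci-staging:2.1.0_cu121_flash2-buildcache')
```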
5 changes: 5 additions & 0 deletions .github/workflows/pr-gpu.yaml
@@ -18,6 +18,7 @@ jobs:
uses: ./.github/workflows/pytest-gpu.yaml
strategy:
matrix:
# TODO: After the PR with the flash attention 2 images goes in, add the new unit test suite
include:
- name: 'gpu-latest'
container: mosaicml/pytorch:latest # mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
@@ -31,6 +32,10 @@
container: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04
markers: 'gpu'
pytest_command: 'coverage run -m pytest'
- name: 'gpu-2.1.0-flash2'
container: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest
markers: 'gpu'
pytest_command: 'coverage run -m pytest'
name: ${{ matrix.name }}
if: github.repository_owner == 'mosaicml'
with:
9 changes: 5 additions & 4 deletions Dockerfile
@@ -4,9 +4,10 @@
ARG BASE_IMAGE
FROM $BASE_IMAGE

ARG DEP_GROUPS

# Install and uninstall foundry to cache foundry requirements
RUN git clone -b main https://github.com/mosaicml/llm-foundry.git && \
pip install --no-cache-dir "./llm-foundry[gpu]" && \
pip uninstall -y llm-foundry && \
rm -rf llm-foundry
RUN git clone -b main https://github.com/mosaicml/llm-foundry.git
RUN pip install --no-cache-dir "./llm-foundry${DEP_GROUPS}"
RUN pip uninstall -y llm-foundry
RUN rm -rf llm-foundry
5 changes: 5 additions & 0 deletions README.md
@@ -93,8 +93,10 @@ If you have success/failure using LLM Foundry on other systems, please let us know
|---------------------------|------------------|--------------|-------------------------------|
| A100-40GB/80GB | 1.13.1 | 11.7 | :white_check_mark: Supported |
| A100-40GB/80GB | 2.0.1 | 11.7, 11.8 | :white_check_mark: Supported |
| A100-40GB/80GB | 2.1.0 | 11.8, 12.1 | :white_check_mark: Supported |
| H100-80GB | 1.13.1 | 11.7 | :x: Not Supported |
| H100-80GB | 2.0.1 | 11.8 | :white_check_mark: Supported |
| H100-80GB | 2.1.0 | 12.1 | :white_check_mark: Supported |
| A10-24GB | 1.13.1 | 11.7 | :construction: In Progress |
| A10-24GB | 2.0.1 | 11.7, 11.8 | :construction: In Progress |
| MI250 | 2.0.1 | ROCm 5.4 | :construction: In Progress |
@@ -113,8 +115,11 @@ You can select a specific commit hash such as `mosaicml/llm-foundry:1.13.1_cu117
|-------------------------------------------------------------|----------------|--------------|-------------------------------------|
| `mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04` | 1.13.1 | 11.7 | No |
| `mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04` | 2.0.1 | 11.8 | No |
| `mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04` | 2.1.0 | 12.1 | No |
| `mosaicml/llm-foundry:1.13.1_cu117-latest` | 1.13.1 | 11.7 | Yes |
| `mosaicml/llm-foundry:2.0.1_cu118-latest` | 2.0.1 | 11.8 | Yes |
| `mosaicml/llm-foundry:2.1.0_cu121-latest` | 2.1.0 | 12.1 | Yes (flash attention v1) |
| `mosaicml/llm-foundry:2.1.0_cu121_flash2-latest` | 2.1.0 | 12.1 | Yes (flash attention v2) |


# Installation
62 changes: 48 additions & 14 deletions llmfoundry/models/layers/attention.py
@@ -17,6 +17,22 @@
from llmfoundry.models.layers.norm import NORM_CLASS_REGISTRY


def is_flash_v2_installed():
try:
import flash_attn as flash_attn
except:
return False
return version.parse(flash_attn.__version__) >= version.parse('2.0.0')


def is_flash_v1_installed():
try:
import flash_attn as flash_attn
except:
return False
return version.parse(flash_attn.__version__) < version.parse('2.0.0')


def _reset_is_causal(num_query_tokens: int, num_key_tokens: int,
original_is_causal: bool) -> bool:
# disable causal when it is not needed
@@ -197,7 +213,8 @@ def flash_attn_fn(
try:
from flash_attn import bert_padding, flash_attn_interface # type: ignore # yapf: disable # isort: skip
except:
raise RuntimeError('Please install flash-attn==1.0.3.post0')
raise RuntimeError(
'Please install flash-attn==1.0.9 or flash-attn==2.3.2')

check_valid_inputs(query, key, value)

@@ -278,18 +295,35 @@ def flash_attn_fn(

reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)

output_unpad = flash_attn_interface.flash_attn_unpadded_func(
query_unpad,
key_unpad,
value_unpad,
cu_seqlens_q,
cu_seqlens_k,
max_seqlen_q,
max_seqlen_k,
dropout_p,
softmax_scale=softmax_scale,
causal=reset_is_causal,
return_attn_probs=needs_weights)
if is_flash_v1_installed():
output_unpad = flash_attn_interface.flash_attn_unpadded_func(
q=query_unpad,
k=key_unpad,
v=value_unpad,
cu_seqlens_q=cu_seqlens_q,
cu_seqlens_k=cu_seqlens_k,
max_seqlen_q=max_seqlen_q,
max_seqlen_k=max_seqlen_k,
dropout_p=dropout_p,
softmax_scale=softmax_scale,
causal=reset_is_causal,
return_attn_probs=needs_weights)
elif is_flash_v2_installed():
output_unpad = flash_attn_interface.flash_attn_varlen_func(
q=query_unpad,
k=key_unpad,
v=value_unpad,
cu_seqlens_q=cu_seqlens_q,
cu_seqlens_k=cu_seqlens_k,
max_seqlen_q=max_seqlen_q,
max_seqlen_k=max_seqlen_k,
dropout_p=dropout_p,
softmax_scale=softmax_scale,
causal=reset_is_causal,
return_attn_probs=needs_weights)
else:
raise RuntimeError(
'flash-attn==1.0.9 or flash-attn==2.3.2 is required.')

output = bert_padding.pad_input(
rearrange(output_unpad, 'nnz h d -> nnz (h d)'), indices_q, batch_size,
@@ -321,7 +355,7 @@ def triton_flash_attn_fn(
if version.parse(torch.__version__) < version.parse('2.0.0'):
_installed = True
# if torch1.13.1 revert to using triton flash attn from HazyResearch
# with flash-attn==1.0.3.post0 and triton==2.0.0.dev20221202
# with flash-attn==1.0.9 and triton==2.0.0.dev20221202
try:
from flash_attn.flash_attn_triton import flash_attn_func
except:
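
The two helpers added near the top of this file (`is_flash_v1_installed` / `is_flash_v2_installed`) gate which flash-attn entry point gets called. A minimal sketch of the same dispatch, assuming flash-attn is importable under either major version (the helper name below is made up for illustration):

```python
from packaging import version

def pick_varlen_flash_attn():
    """Return the unpadded/varlen attention function for the installed flash-attn."""
    import flash_attn
    from flash_attn import flash_attn_interface
    if version.parse(flash_attn.__version__) >= version.parse('2.0.0'):
        # flash-attn v2 exposes the unpadded kernel as flash_attn_varlen_func
        return flash_attn_interface.flash_attn_varlen_func
    # flash-attn v1 (< 2.0.0) exposes it as flash_attn_unpadded_func
    return flash_attn_interface.flash_attn_unpadded_func
```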
153 changes: 153 additions & 0 deletions llmfoundry/optim/scheduler.py
@@ -0,0 +1,153 @@
# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0

"""Experimental learning rate schedulers used for training LLMs."""

import textwrap
import warnings
from typing import Union

from composer.core import State, Time, TimeUnit
from composer.optim import ComposerScheduler, LinearScheduler
from composer.optim.scheduler import _convert_time

__all__ = ['InverseSquareRootWithWarmupScheduler']


def _raise_if_units_dont_match(time: Union[str, Time], t_max: Union[str, Time],
name: str) -> None:
if isinstance(time, str):
time = Time.from_timestring(time)
if isinstance(t_max, str):
t_max = Time.from_timestring(t_max)
if time.unit != t_max.unit:
raise ValueError(f'{time.unit=} does not match {t_max.unit=}.')


def _raise_if_units_dur(time: Union[str, Time], name: str) -> None:
if isinstance(time, str):
time = Time.from_timestring(time)
if time.unit == TimeUnit('dur'):
raise ValueError(f'{name} cannot be in units of "dur".')


class InverseSquareRootWithWarmupScheduler(ComposerScheduler):
r"""Inverse square root LR decay with warmup and optional linear cooldown.
Specifically, the learning rate multiplier :math:`\alpha(t)` can be expressed as:
.. math::
\alpha(t) = \begin{cases}
t / t_{warmup}, & \text{if } t < t_{warmup} \\
\alpha_{f,decay} + \frac{1 - \alpha_{f,decay}}{\sqrt{\tau_d}}, & \text{if } t_{warmup} <= t < t_{max} - t_{cooldown} \\
\alpha_i + (\alpha_{f,cooldown} - \alpha_i) \times \tau_c, & \text{otherwise}
\end{cases}
Given :math:`\tau_d`, the time elapsed during the inverse square root decay (normalized by :math:`t_{scale}`), as:
.. math::
\tau_d = (t - t_{warmup} + t_{scale}) / t_{scale}
:math:`\alpha_i` as the value of the learning rate multiplier when :math:`\tau_d` is evaluated at :math:`t = t_{max} - t_{cooldown}`,
and :math:`\tau_c`, the fraction of linear cooldown time elapsed (clipped to the interval :math:`[0, 1]`), as:
.. math::
\tau_c = (t - t_{max} + t_{cooldown}) / t_{cooldown}
Where :math:`t_{warmup}` represents the warmup time, :math:`t_{scale}` represents the time scale,
:math:`t_{cooldown}` represents the cooldown time, :math:`t_{max}` represents the duration of this scheduler,
:math:`\alpha_{f,decay}` represents the learning rate multiplier that the inverse square root decays to at infinite time,
and :math:`\alpha_{f,cooldown}` represents the learning rate multiplier that the linear cooldown decays to.
Note, :math:`\alpha_{f,decay} >= \alpha_{f,cooldown}` to ensure that the learning rate is monotonically decreasing after warmup.
Also note, ``t_warmup``, ``t_scale``, and ``t_cooldown`` cannot be specified in units of duration; since this schedule is designed for continual learning,
``max_duration`` is expected to change. Instead, these parameters need to be specified in the same units as ``max_duration`` passed to the trainer.
Args:
t_warmup (str | Time): The warmup time.
t_scale (str | Time): The time scale.
t_cooldown (str | Time): The cooldown time.
t_max (str | Time): The duration of this scheduler. Default = ``"1dur"``.
alpha_f_decay (float): The learning rate multiplier to decay inverse square root decay to. Default = ``0.0``.
alpha_f_cooldown (float): The learning rate multiplier to decay linear cooldown to. Default = ``0.0``.
"""

def __init__(self,
t_warmup: Union[str, Time],
t_scale: Union[str, Time],
t_cooldown: Union[str, Time],
t_max: Union[str, Time] = '1dur',
alpha_f_decay: float = 0.0,
alpha_f_cooldown: float = 0.0) -> None:
if alpha_f_decay < alpha_f_cooldown:
raise ValueError(('Required: alpha_f_decay >= alpha_f_cooldown. '
f'Current: alpha_f_decay={alpha_f_decay}, '
f'alpha_f_cooldown={alpha_f_cooldown}.'))
_raise_if_units_dur(t_warmup, 't_warmup')
_raise_if_units_dur(t_scale, 't_scale')
_raise_if_units_dur(t_cooldown, 't_cooldown')
self.t_warmup = t_warmup
self.t_scale = t_scale
self.t_cooldown = t_cooldown
self.t_max = t_max
self.alpha_f_decay = alpha_f_decay
self.alpha_f_cooldown = alpha_f_cooldown
self.warmup_scheduler = LinearScheduler(alpha_i=0.0,
alpha_f=1.0,
t_max=t_warmup)

def __call__(self, state: State, ssr: float = 1.0) -> float:
assert state.max_duration is not None, 'max_duration should be set whenever schedulers are invoked'
_raise_if_units_dont_match(self.t_warmup, state.max_duration,
't_warmup')
_raise_if_units_dont_match(self.t_scale, state.max_duration, 't_scale')
_raise_if_units_dont_match(self.t_cooldown, state.max_duration,
't_cooldown')

t_warmup = _convert_time(self.t_warmup, state)
if t_warmup.value == 0:
warnings.warn(
textwrap.dedent("""\
The warmup duration is 0. If warmup was specified as a fraction of the total
training duration, the warmup duration is calculated in the
same unit as the trainer's max_duration parameter."""))

if state.timestamp < t_warmup:
return self.warmup_scheduler(state)

t_scale = _convert_time(self.t_scale, state, ssr=ssr)
t_cooldown = _convert_time(self.t_cooldown, state, ssr=ssr)
t_max = _convert_time(self.t_max, state, ssr=ssr)
current_time = state.timestamp.get(t_scale.unit)

t_shift = t_scale - t_warmup
# t_cooldown_start is max of t_warmup, t_max - t_cooldown
t_cooldown_start = t_max - t_cooldown
if t_cooldown_start < t_warmup:
t_cooldown_start = t_warmup

if state.timestamp < t_cooldown_start:
# Rescale LR by a coefficient equal to the inverse square root of the time
# elapsed after warmup, rescaled by the time scale, such that, at
# infinite time, the LR decays to alpha_f_decay.
coeff = 1 / ((current_time + t_shift) / t_scale).value**0.5
current_factor = (self.alpha_f_decay + coeff *
(1.0 - self.alpha_f_decay))
return current_factor

else:
coeff = 1 / ((t_cooldown_start + t_shift) / t_scale).value**0.5
alpha_i = self.alpha_f_decay + coeff * (1.0 - self.alpha_f_decay)

if t_cooldown.value == 0:
return alpha_i

# Linearly decay the LR from its value at the step at which cooldown
# started to alpha_f_cooldown over t_cooldown time.
frac_of_cooldown = ((current_time - t_cooldown_start) /
t_cooldown).value
frac_of_cooldown = min(1.0, frac_of_cooldown)
current_factor = (alpha_i + frac_of_cooldown *
(self.alpha_f_cooldown - alpha_i))
return current_factor
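
For intuition, the schedule can be evaluated directly from the formula in the docstring. The sketch below is a standalone re-statement (it does not use Composer's `State`/`Time` types; all times are raw step counts and `ssr` is assumed to be 1):

```python
def inv_sqrt_with_warmup(step: float, t_warmup: float, t_scale: float,
                         t_cooldown: float, t_max: float,
                         alpha_f_decay: float = 0.0,
                         alpha_f_cooldown: float = 0.0) -> float:
    """LR multiplier at `step`, with all times expressed as step counts."""
    if step < t_warmup:
        return step / t_warmup  # linear warmup from 0 to 1
    t_shift = t_scale - t_warmup
    t_cooldown_start = max(t_max - t_cooldown, t_warmup)
    if step < t_cooldown_start:
        # inverse square root decay toward alpha_f_decay
        coeff = 1 / ((step + t_shift) / t_scale) ** 0.5
        return alpha_f_decay + coeff * (1.0 - alpha_f_decay)
    # value at the start of cooldown, then linear decay to alpha_f_cooldown
    coeff = 1 / ((t_cooldown_start + t_shift) / t_scale) ** 0.5
    alpha_i = alpha_f_decay + coeff * (1.0 - alpha_f_decay)
    if t_cooldown == 0:
        return alpha_i
    frac = min(1.0, (step - t_cooldown_start) / t_cooldown)
    return alpha_i + frac * (alpha_f_cooldown - alpha_i)

# With t_warmup=100, t_scale=100, t_cooldown=200, t_max=1000 (all in batches):
# step 50 -> 0.5, step 100 -> 1.0, step 400 -> 0.5, step 800 -> ~0.354, step 1000 -> 0.0
```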
3 changes: 3 additions & 0 deletions llmfoundry/utils/builders.py
@@ -32,6 +32,7 @@
ScheduledGarbageCollector)
from llmfoundry.optim import (DecoupledAdaLRLion, DecoupledClipLion,
DecoupledLionW, DecoupledLionW_8bit)
from llmfoundry.optim.scheduler import InverseSquareRootWithWarmupScheduler
from llmfoundry.tokenizers.tiktoken import TiktokenTokenizerWrapper

log = logging.getLogger(__name__)
@@ -158,6 +159,8 @@ def build_scheduler(name: str,
return ConstantWithWarmupScheduler(**scheduler_config)
elif name == 'cosine_with_warmup':
return CosineAnnealingWithWarmupScheduler(**scheduler_config)
elif name == 'inv_sqrt_with_warmup':
return InverseSquareRootWithWarmupScheduler(**scheduler_config)
elif name == 'linear_decay_with_warmup':
return LinearWithWarmupScheduler(**scheduler_config)
else:
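
A hedged usage sketch for the new registry entry follows. The full `build_scheduler` signature is not shown in this hunk, so the call below assumes it takes the scheduler name and a config dict whose keys mirror `InverseSquareRootWithWarmupScheduler`'s constructor arguments (time strings use Composer units, e.g. `ba` for batches); the values are hypothetical.

```python
from llmfoundry.utils.builders import build_scheduler

# Hypothetical config for illustration only.
scheduler = build_scheduler(
    'inv_sqrt_with_warmup',
    {
        't_warmup': '500ba',
        't_scale': '500ba',
        't_cooldown': '2000ba',
        'alpha_f_decay': 0.1,
        'alpha_f_cooldown': 0.0,
    },
)
```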