Merge branch 'main' into mlflow-log-model
dakinggg authored Oct 11, 2023
2 parents 6369b7b + cdb1c28 commit a36f93b
Showing 19 changed files with 1,173 additions and 35 deletions.
41 changes: 36 additions & 5 deletions .github/workflows/docker.yaml
@@ -3,6 +3,12 @@ on:
push:
branches:
- main
pull_request:
branches:
- main
paths:
- ./Dockerfile
- .github/workflows/docker.yaml
workflow_dispatch: {}
jobs:
docker-build:
@@ -13,10 +19,16 @@ jobs:
include:
- name: '1.13.1_cu117'
base_image: mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
dep_groups: '[gpu]'
- name: '2.0.1_cu118'
base_image: mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04
dep_groups: '[gpu]'
- name: '2.1.0_cu121'
base_image: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04
dep_groups: '[gpu]'
- name: '2.1.0_cu121_flash2'
base_image: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04
dep_groups: '[gpu-flash2]'

steps:
- name: Maximize Build Space on Worker
@@ -52,13 +64,32 @@ jobs:
GIT_SHA=$(echo ${{ github.sha }} | cut -c1-7)
echo "IMAGE_TAG=${GIT_SHA}" >> ${GITHUB_ENV}
if [ "${{ github.event_name }}" == "push" ]; then
echo "Triggered by push event."
PROD_REPO="mosaicml/llm-foundry"
IMAGE_TAG="${PROD_REPO}:${{matrix.name}}-${GIT_SHA},${PROD_REPO}:${{matrix.name}}-latest"
IMAGE_CACHE="${PROD_REPO}:${{matrix.name}}-buildcache"
elif [ "${{ github.event_name }}" == "pull_request" ]; then
echo "Triggered by pull_request event."
STAGING_REPO="mosaicml/ci-staging"
IMAGE_TAG="${STAGING_REPO}:${{matrix.name}}-${GIT_SHA}"
IMAGE_CACHE="${STAGING_REPO}:${{matrix.name}}-buildcache"
else
echo "Triggered by unknown event: ${{ github.event_name }}"
exit 1
fi
echo "IMAGE_TAG=${IMAGE_TAG}" >> ${GITHUB_ENV}
echo "IMAGE_CACHE=${IMAGE_CACHE}" >> ${GITHUB_ENV}
- name: Build and Push the Docker Image
uses: docker/build-push-action@v3
with:
context: .
tags: mosaicml/llm-foundry:${{ matrix.name }}-latest,
mosaicml/llm-foundry:${{ matrix.name }}-${{ env.IMAGE_TAG }}
tags: ${{ env.IMAGE_TAG }}
push: true
cache-from: type=registry,ref=mosaicml/llm-foundry:${{ matrix.name }}-buildcache
cache-to: type=registry,ref=mosaicml/llm-foundry:${{ matrix.name }}-buildcache,mode=max
build-args: BASE_IMAGE=${{ matrix.base_image }}
cache-from: type=registry,ref=${{ env.IMAGE_CACHE }}
cache-to: type=registry,ref=${{ env.IMAGE_CACHE }},mode=max
build-args: |
BASE_IMAGE=${{ matrix.base_image }}
DEP_GROUPS=${{ matrix.dep_groups }}
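
For reference, the image-tag and build-cache selection introduced above can be summarized with the sketch below. It is illustrative only and not part of the workflow; the repository names and event handling mirror the shell step, and the helper name is invented for this note.

```python
# Illustrative sketch of the IMAGE_TAG / IMAGE_CACHE selection in the workflow step above.
def image_refs(event_name: str, matrix_name: str, git_sha: str) -> tuple:
    short_sha = git_sha[:7]  # mirrors `cut -c1-7`
    if event_name == 'push':
        repo = 'mosaicml/llm-foundry'  # production repo; also tagged "-latest"
        tags = f'{repo}:{matrix_name}-{short_sha},{repo}:{matrix_name}-latest'
    elif event_name == 'pull_request':
        repo = 'mosaicml/ci-staging'  # staging repo for PR builds
        tags = f'{repo}:{matrix_name}-{short_sha}'
    else:
        raise ValueError(f'Triggered by unknown event: {event_name}')
    return tags, f'{repo}:{matrix_name}-buildcache'

# image_refs('pull_request', '2.1.0_cu121_flash2', '<full git sha>') ->
#   ('mosaicml/ci-staging:2.1.0_cu121_flash2-<short sha>',
#    'mosaicml/ci-staging:2.1.0_cu121_flash2-buildcache')
```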
5 changes: 5 additions & 0 deletions .github/workflows/pr-gpu.yaml
@@ -18,6 +18,7 @@ jobs:
uses: ./.github/workflows/pytest-gpu.yaml
strategy:
matrix:
# TODO: After the PR with the flash attention 2 images goes in, add the new unit test suite
include:
- name: 'gpu-latest'
container: mosaicml/pytorch:latest # mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
@@ -31,6 +32,10 @@
container: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04
markers: 'gpu'
pytest_command: 'coverage run -m pytest'
- name: 'gpu-2.1.0-flash2'
container: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest
markers: 'gpu'
pytest_command: 'coverage run -m pytest'
name: ${{ matrix.name }}
if: github.repository_owner == 'mosaicml'
with:
9 changes: 5 additions & 4 deletions Dockerfile
@@ -4,9 +4,10 @@
ARG BASE_IMAGE
FROM $BASE_IMAGE

ARG DEP_GROUPS

# Install and uninstall foundry to cache foundry requirements
RUN git clone -b main https://github.com/mosaicml/llm-foundry.git && \
pip install --no-cache-dir "./llm-foundry[gpu]" && \
pip uninstall -y llm-foundry && \
rm -rf llm-foundry
RUN git clone -b main https://github.com/mosaicml/llm-foundry.git
RUN pip install --no-cache-dir "./llm-foundry${DEP_GROUPS}"
RUN pip uninstall -y llm-foundry
RUN rm -rf llm-foundry
5 changes: 5 additions & 0 deletions README.md
@@ -93,8 +93,10 @@ If you have success/failure using LLM Foundry on other systems, please let us know
|---------------------------|------------------|--------------|-------------------------------|
| A100-40GB/80GB | 1.13.1 | 11.7 | :white_check_mark: Supported |
| A100-40GB/80GB | 2.0.1 | 11.7, 11.8 | :white_check_mark: Supported |
| A100-40GB/80GB | 2.1.0 | 11.8, 12.1 | :white_check_mark: Supported |
| H100-80GB | 1.13.1 | 11.7 | :x: Not Supported |
| H100-80GB | 2.0.1 | 11.8 | :white_check_mark: Supported |
| H100-80GB | 2.1.0 | 12.1 | :white_check_mark: Supported |
| A10-24GB | 1.13.1 | 11.7 | :construction: In Progress |
| A10-24GB | 2.0.1 | 11.7, 11.8 | :construction: In Progress |
| MI250 | 2.0.1 | ROCm 5.4 | :construction: In Progress |
@@ -113,8 +115,11 @@ You can select a specific commit hash such as `mosaicml/llm-foundry:1.13.1_cu117
|-------------------------------------------------------------|----------------|--------------|-------------------------------------|
| `mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04` | 1.13.1 | 11.7 | No |
| `mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04` | 2.0.1 | 11.8 | No |
| `mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04` | 2.1.0 | 12.1 | No |
| `mosaicml/llm-foundry:1.13.1_cu117-latest` | 1.13.1 | 11.7 | Yes |
| `mosaicml/llm-foundry:2.0.1_cu118-latest` | 2.0.1 | 11.8 | Yes |
| `mosaicml/llm-foundry:2.1.0_cu121-latest` | 2.1.0 | 12.1 | Yes (flash attention v1) |
| `mosaicml/llm-foundry:2.1.0_cu121_flash2-latest` | 2.1.0 | 12.1 | Yes (flash attention v2) |


# Installation
62 changes: 48 additions & 14 deletions llmfoundry/models/layers/attention.py
@@ -17,6 +17,22 @@
from llmfoundry.models.layers.norm import NORM_CLASS_REGISTRY


def is_flash_v2_installed():
try:
import flash_attn as flash_attn
except:
return False
return version.parse(flash_attn.__version__) >= version.parse('2.0.0')


def is_flash_v1_installed():
try:
import flash_attn as flash_attn
except:
return False
return version.parse(flash_attn.__version__) < version.parse('2.0.0')


def _reset_is_causal(num_query_tokens: int, num_key_tokens: int,
original_is_causal: bool) -> bool:
# disable causal when it is not needed
@@ -197,7 +213,8 @@ def flash_attn_fn(
try:
from flash_attn import bert_padding, flash_attn_interface # type: ignore # yapf: disable # isort: skip
except:
raise RuntimeError('Please install flash-attn==1.0.3.post0')
raise RuntimeError(
'Please install flash-attn==1.0.9 or flash-attn==2.3.2')

check_valid_inputs(query, key, value)

@@ -278,18 +295,35 @@ def flash_attn_fn(

reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)

output_unpad = flash_attn_interface.flash_attn_unpadded_func(
query_unpad,
key_unpad,
value_unpad,
cu_seqlens_q,
cu_seqlens_k,
max_seqlen_q,
max_seqlen_k,
dropout_p,
softmax_scale=softmax_scale,
causal=reset_is_causal,
return_attn_probs=needs_weights)
if is_flash_v1_installed():
output_unpad = flash_attn_interface.flash_attn_unpadded_func(
q=query_unpad,
k=key_unpad,
v=value_unpad,
cu_seqlens_q=cu_seqlens_q,
cu_seqlens_k=cu_seqlens_k,
max_seqlen_q=max_seqlen_q,
max_seqlen_k=max_seqlen_k,
dropout_p=dropout_p,
softmax_scale=softmax_scale,
causal=reset_is_causal,
return_attn_probs=needs_weights)
elif is_flash_v2_installed():
output_unpad = flash_attn_interface.flash_attn_varlen_func(
q=query_unpad,
k=key_unpad,
v=value_unpad,
cu_seqlens_q=cu_seqlens_q,
cu_seqlens_k=cu_seqlens_k,
max_seqlen_q=max_seqlen_q,
max_seqlen_k=max_seqlen_k,
dropout_p=dropout_p,
softmax_scale=softmax_scale,
causal=reset_is_causal,
return_attn_probs=needs_weights)
else:
raise RuntimeError(
'flash-attn==1.0.9 or flash-attn==2.3.2 is required.')

output = bert_padding.pad_input(
rearrange(output_unpad, 'nnz h d -> nnz (h d)'), indices_q, batch_size,
@@ -321,7 +355,7 @@ def triton_flash_attn_fn(
if version.parse(torch.__version__) < version.parse('2.0.0'):
_installed = True
# if torch1.13.1 revert to using triton flash attn from HazyResearch
# with flash-attn==1.0.3.post0 and triton==2.0.0.dev20221202
# with flash-attn==1.0.9 and triton==2.0.0.dev20221202
try:
from flash_attn.flash_attn_triton import flash_attn_func
except:
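
The two helpers added near the top of this file (`is_flash_v1_installed` / `is_flash_v2_installed`) gate which flash-attn entry point gets called. A minimal sketch of the same dispatch, assuming flash-attn is importable under either major version (the helper name below is made up for illustration):

```python
from packaging import version

def pick_varlen_flash_attn():
    """Return the unpadded/varlen attention function for the installed flash-attn."""
    import flash_attn
    from flash_attn import flash_attn_interface
    if version.parse(flash_attn.__version__) >= version.parse('2.0.0'):
        # flash-attn v2 exposes the unpadded kernel as flash_attn_varlen_func
        return flash_attn_interface.flash_attn_varlen_func
    # flash-attn v1 (< 2.0.0) exposes it as flash_attn_unpadded_func
    return flash_attn_interface.flash_attn_unpadded_func
```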
153 changes: 153 additions & 0 deletions llmfoundry/optim/scheduler.py
@@ -0,0 +1,153 @@
# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0

"""Experimental learning rate schedulers used for training LLMs."""

import textwrap
import warnings
from typing import Union

from composer.core import State, Time, TimeUnit
from composer.optim import ComposerScheduler, LinearScheduler
from composer.optim.scheduler import _convert_time

__all__ = ['InverseSquareRootWithWarmupScheduler']


def _raise_if_units_dont_match(time: Union[str, Time], t_max: Union[str, Time],
name: str) -> None:
if isinstance(time, str):
time = Time.from_timestring(time)
if isinstance(t_max, str):
t_max = Time.from_timestring(t_max)
if time.unit != t_max.unit:
raise ValueError(f'{time.unit=} does not match {t_max.unit=}.')


def _raise_if_units_dur(time: Union[str, Time], name: str) -> None:
if isinstance(time, str):
time = Time.from_timestring(time)
if time.unit == TimeUnit('dur'):
raise ValueError(f'{name} cannot be in units of "dur".')


class InverseSquareRootWithWarmupScheduler(ComposerScheduler):
r"""Inverse square root LR decay with warmup and optional linear cooldown.
Specifically, the learning rate multiplier :math:`\alpha(t)` can be expressed as:
.. math::
\alpha(t) = \begin{cases}
t / t_{warmup}, & \text{if } t < t_{warmup} \\
\alpha_{f,decay} + \frac{1 - \alpha_{f,decay}}{\sqrt{\tau_d}}, & \text{if } t_{warmup} <= t < t_{max} - t_{cooldown} \\
\alpha_i + (\alpha_{f,cooldown} - \alpha_i) \times \tau_c, & \text{otherwise}
\end{cases}
Given :math:`\tau_d`, the time elapsed during the inverse square root decay (normalized by :math:`t_{scale}`), as:
.. math::
\tau_d = (t - t_{warmup} + t_{scale}) / t_{scale}
:math:`\alpha_i` as the value of the learning rate multiplier when :math:`\tau_d` is evaluated at :math:`t = t_{max} - t_{cooldown}`,
and :math:`\tau_c`, the fraction of linear cooldown time elapsed (clipped to the interval :math:`[0, 1]`), as:
.. math::
\tau_c = (t - t_{max} + t_{cooldown}) / t_{cooldown}
Where :math:`t_{warmup}` represents the warmup time, :math:`t_{scale}` represents the time scale,
:math:`t_{cooldown}` represents the cooldown time, :math:`t_{max}` represents the duration of this scheduler,
:math:`\alpha_{f,decay}` represents the learning rate multiplier that the inverse square root decays to at infinite time,
and :math:`\alpha_{f,cooldown}` represents the learning rate multiplier that the linear cooldown decays to.
Note, :math:`\alpha_{f,decay} >= \alpha_{f,cooldown}` to ensure that the learning rate is monotonically decreasing after warmup.
Also note, ``t_warmup``, ``t_scale``, and ``t_cooldown`` cannot be specified in units of duration; since this schedule is designed for continual learning,
``max_duration`` is expected to change. Instead, these parameters need to be specified in the same units as ``max_duration`` passed to the trainer.
Args:
t_warmup (str | Time): The warmup time.
t_scale (str | Time): The time scale.
t_cooldown (str | Time): The cooldown time.
t_max (str | Time): The duration of this scheduler. Default = ``"1dur"``.
alpha_f_decay (float): The learning rate multiplier to decay inverse square root decay to. Default = ``0.0``.
alpha_f_cooldown (float): The learning rate multiplier to decay linear cooldown to. Default = ``0.0``.
"""

def __init__(self,
t_warmup: Union[str, Time],
t_scale: Union[str, Time],
t_cooldown: Union[str, Time],
t_max: Union[str, Time] = '1dur',
alpha_f_decay: float = 0.0,
alpha_f_cooldown: float = 0.0) -> None:
if alpha_f_decay < alpha_f_cooldown:
raise ValueError(('Required: alpha_f_decay >= alpha_f_cooldown. '
f'Current: alpha_f_decay={alpha_f_decay}, '
f'alpha_f_cooldown={alpha_f_cooldown}.'))
_raise_if_units_dur(t_warmup, 't_warmup')
_raise_if_units_dur(t_scale, 't_scale')
_raise_if_units_dur(t_cooldown, 't_cooldown')
self.t_warmup = t_warmup
self.t_scale = t_scale
self.t_cooldown = t_cooldown
self.t_max = t_max
self.alpha_f_decay = alpha_f_decay
self.alpha_f_cooldown = alpha_f_cooldown
self.warmup_scheduler = LinearScheduler(alpha_i=0.0,
alpha_f=1.0,
t_max=t_warmup)

def __call__(self, state: State, ssr: float = 1.0) -> float:
assert state.max_duration is not None, 'max_duration should be set whenever schedulers are invoked'
_raise_if_units_dont_match(self.t_warmup, state.max_duration,
't_warmup')
_raise_if_units_dont_match(self.t_scale, state.max_duration, 't_scale')
_raise_if_units_dont_match(self.t_cooldown, state.max_duration,
't_cooldown')

t_warmup = _convert_time(self.t_warmup, state)
if t_warmup.value == 0:
warnings.warn(
textwrap.dedent("""\
The warmup duration is 0. If warmup was specified as a fraction of the total
training duration, the warmup duration is calculated in the
same unit as the trainer's max_duration parameter."""))

if state.timestamp < t_warmup:
return self.warmup_scheduler(state)

t_scale = _convert_time(self.t_scale, state, ssr=ssr)
t_cooldown = _convert_time(self.t_cooldown, state, ssr=ssr)
t_max = _convert_time(self.t_max, state, ssr=ssr)
current_time = state.timestamp.get(t_scale.unit)

t_shift = t_scale - t_warmup
# t_cooldown_start is max of t_warmup, t_max - t_cooldown
t_cooldown_start = t_max - t_cooldown
if t_cooldown_start < t_warmup:
t_cooldown_start = t_warmup

if state.timestamp < t_cooldown_start:
# Rescale LR by a coefficient equal to the inverse square root of the time
# elapsed after warmup, rescaled by the time scale, such that, at
# infinite time, the LR decays to alpha_f_decay.
coeff = 1 / ((current_time + t_shift) / t_scale).value**0.5
current_factor = (self.alpha_f_decay + coeff *
(1.0 - self.alpha_f_decay))
return current_factor

else:
coeff = 1 / ((t_cooldown_start + t_shift) / t_scale).value**0.5
alpha_i = self.alpha_f_decay + coeff * (1.0 - self.alpha_f_decay)

if t_cooldown.value == 0:
return alpha_i

# Linearly decay the LR from its value at the step at which cooldown
# started to alpha_f_cooldown over t_cooldown time.
frac_of_cooldown = ((current_time - t_cooldown_start) /
t_cooldown).value
frac_of_cooldown = min(1.0, frac_of_cooldown)
current_factor = (alpha_i + frac_of_cooldown *
(self.alpha_f_cooldown - alpha_i))
return current_factor
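
For intuition, the schedule can be evaluated directly from the formula in the docstring. The sketch below is a standalone re-statement (it does not use Composer's `State`/`Time` types; all times are raw step counts and `ssr` is assumed to be 1):

```python
def inv_sqrt_with_warmup(step: float, t_warmup: float, t_scale: float,
                         t_cooldown: float, t_max: float,
                         alpha_f_decay: float = 0.0,
                         alpha_f_cooldown: float = 0.0) -> float:
    """LR multiplier at `step`, with all times expressed as step counts."""
    if step < t_warmup:
        return step / t_warmup  # linear warmup from 0 to 1
    t_shift = t_scale - t_warmup
    t_cooldown_start = max(t_max - t_cooldown, t_warmup)
    if step < t_cooldown_start:
        # inverse square root decay toward alpha_f_decay
        coeff = 1 / ((step + t_shift) / t_scale) ** 0.5
        return alpha_f_decay + coeff * (1.0 - alpha_f_decay)
    # value at the start of cooldown, then linear decay to alpha_f_cooldown
    coeff = 1 / ((t_cooldown_start + t_shift) / t_scale) ** 0.5
    alpha_i = alpha_f_decay + coeff * (1.0 - alpha_f_decay)
    if t_cooldown == 0:
        return alpha_i
    frac = min(1.0, (step - t_cooldown_start) / t_cooldown)
    return alpha_i + frac * (alpha_f_cooldown - alpha_i)

# With t_warmup=100, t_scale=100, t_cooldown=200, t_max=1000 (all in batches):
# step 50 -> 0.5, step 100 -> 1.0, step 400 -> 0.5, step 800 -> ~0.354, step 1000 -> 0.0
```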
3 changes: 3 additions & 0 deletions llmfoundry/utils/builders.py
@@ -32,6 +32,7 @@
ScheduledGarbageCollector)
from llmfoundry.optim import (DecoupledAdaLRLion, DecoupledClipLion,
DecoupledLionW, DecoupledLionW_8bit)
from llmfoundry.optim.scheduler import InverseSquareRootWithWarmupScheduler
from llmfoundry.tokenizers.tiktoken import TiktokenTokenizerWrapper

log = logging.getLogger(__name__)
@@ -158,6 +159,8 @@ def build_scheduler(name: str,
return ConstantWithWarmupScheduler(**scheduler_config)
elif name == 'cosine_with_warmup':
return CosineAnnealingWithWarmupScheduler(**scheduler_config)
elif name == 'inv_sqrt_with_warmup':
return InverseSquareRootWithWarmupScheduler(**scheduler_config)
elif name == 'linear_decay_with_warmup':
return LinearWithWarmupScheduler(**scheduler_config)
else:
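
A hedged usage sketch for the new registry entry follows. The full `build_scheduler` signature is not shown in this hunk, so the call below assumes it takes the scheduler name and a config dict whose keys mirror `InverseSquareRootWithWarmupScheduler`'s constructor arguments (time strings use Composer units, e.g. `ba` for batches); the values are hypothetical.

```python
from llmfoundry.utils.builders import build_scheduler

# Hypothetical config for illustration only.
scheduler = build_scheduler(
    'inv_sqrt_with_warmup',
    {
        't_warmup': '500ba',
        't_scale': '500ba',
        't_cooldown': '2000ba',
        'alpha_f_decay': 0.1,
        'alpha_f_cooldown': 0.0,
    },
)
```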