Merge branch 'main' into EOS_token_checker
KuuCi authored Feb 1, 2024
2 parents 785f906 + 203edad commit 2cee526
Showing 66 changed files with 274 additions and 236 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/code-quality.yaml
@@ -24,10 +24,10 @@ jobs:
strategy:
matrix:
python_version:
- '3.9'
- '3.10'
- "3.9"
- "3.10"
pip_deps:
- '[dev]'
- "[dev]"
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
6 changes: 3 additions & 3 deletions .github/workflows/codeql-analysis.yml
@@ -9,7 +9,7 @@
# the `language` matrix defined below to confirm you have the correct set of
# supported CodeQL languages.
#
name: 'CodeQL'
name: "CodeQL"

on:
push:
@@ -18,7 +18,7 @@ on:
# The branches below must be a subset of the branches above
branches: [main]
schedule:
- cron: '0 9 * * 1' # Every Monday at 09:00 (9:00 AM)
- cron: "0 9 * * 1" # Every Monday at 09:00 (9:00 AM)

jobs:
analyze:
@@ -32,7 +32,7 @@ jobs:
strategy:
fail-fast: false
matrix:
language: ['python']
language: ["python"]
# CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
# Learn more about CodeQL language support at https://git.io/codeql-language-support

17 changes: 9 additions & 8 deletions .github/workflows/docker.yaml
@@ -17,18 +17,18 @@ jobs:
strategy:
matrix:
include:
- name: '2.1.0_cu121'
- name: "2.1.0_cu121"
base_image: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04
dep_groups: '[gpu]'
- name: '2.1.0_cu121_flash2'
dep_groups: "[gpu]"
- name: "2.1.0_cu121_flash2"
base_image: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04
dep_groups: '[gpu-flash2]'
- name: '2.1.0_cu121_aws'
dep_groups: "[gpu-flash2]"
- name: "2.1.0_cu121_aws"
base_image: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04-aws
dep_groups: '[gpu]'
- name: '2.1.0_cu121_flash2_aws'
dep_groups: "[gpu]"
- name: "2.1.0_cu121_flash2_aws"
base_image: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04-aws
dep_groups: '[gpu-flash2]'
dep_groups: "[gpu-flash2]"
steps:
- name: Maximize Build Space on Worker
uses: easimon/maximize-build-space@v4
@@ -88,5 +88,6 @@ jobs:
cache-from: type=registry,ref=${{ env.IMAGE_CACHE }}
cache-to: type=registry,ref=${{ env.IMAGE_CACHE }},mode=max
build-args: |
BRANCH_NAME=${{ github.head_ref || github.ref_name }}
BASE_IMAGE=${{ matrix.base_image }}
DEP_GROUPS=${{ matrix.dep_groups }}
6 changes: 3 additions & 3 deletions .github/workflows/pr-cpu.yaml
@@ -19,10 +19,10 @@ jobs:
strategy:
matrix:
include:
- name: 'cpu-2.1.0'
- name: "cpu-2.1.0"
container: mosaicml/pytorch:2.1.0_cpu-python3.10-ubuntu20.04
markers: 'not gpu'
pytest_command: 'coverage run -m pytest'
markers: "not gpu"
pytest_command: "coverage run -m pytest"
name: ${{ matrix.name }}
if: github.repository_owner == 'mosaicml'
with:
16 changes: 8 additions & 8 deletions .github/workflows/pr-gpu.yaml
@@ -19,16 +19,16 @@ jobs:
strategy:
matrix:
include:
- name: 'gpu-2.1.0'
- name: "gpu-2.1.0"
container: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04
markers: 'gpu'
pytest_command: 'coverage run -m pytest'
deps_group: 'all'
- name: 'gpu-2.1.0-flash2'
markers: "gpu"
pytest_command: "coverage run -m pytest"
deps_group: "all"
- name: "gpu-2.1.0-flash2"
container: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest
markers: 'gpu'
pytest_command: 'coverage run -m pytest'
deps_group: 'all-flash2'
markers: "gpu"
pytest_command: "coverage run -m pytest"
deps_group: "all-flash2"
name: ${{ matrix.name }}
if: github.repository_owner == 'mosaicml'
with:
2 changes: 1 addition & 1 deletion .github/workflows/pytest-gpu.yaml
@@ -30,7 +30,7 @@ on:
required: true
jobs:
pytest-gpu:
timeout-minutes: 60 # ${{ inputs.gha-timeout }} for some reason not able to turn this into an input
timeout-minutes: 60 # ${{ inputs.gha-timeout }} for some reason not able to turn this into an input
runs-on: ubuntu-latest
env:
MOSAICML_API_KEY: ${{ secrets.mcloud-api-key }}
4 changes: 2 additions & 2 deletions .github/workflows/release.yaml
@@ -3,7 +3,7 @@ name: Release
on:
push:
tags:
- 'v*'
- "v*"
workflow_dispatch:

jobs:
@@ -22,7 +22,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v3
with:
python-version: '3.9'
python-version: "3.9"

- name: Build source and wheel distributions
run: |
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -62,9 +62,9 @@ repos:
- id: insert-license
args:
- --license-filepath
- .ci/FILE_HEADER
- .pre-commit/FILE_HEADER
- --comment-style
- '#'
- "#"
- --allow-past-years
types: [python]
- repo: https://github.com/PyCQA/docformatter
File renamed without changes.
2 changes: 1 addition & 1 deletion .yamllint.yaml
@@ -5,7 +5,6 @@ yaml-files:

ignore: |
wandb
*
rules:
braces:
@@ -30,6 +29,7 @@ rules:
key-duplicates: enable
key-ordering: disable
line-length:
max: 120
allow-non-breakable-words: true
allow-non-breakable-inline-mappings: true
new-line-at-end-of-file: enable
8 changes: 7 additions & 1 deletion Dockerfile
@@ -4,10 +4,16 @@
ARG BASE_IMAGE
FROM $BASE_IMAGE

ARG BRANCH_NAME
ARG DEP_GROUPS

# Check for changes in setup.py.
# If there are changes, the docker cache is invalidated and a fresh pip installation is triggered.
ADD https://raw.githubusercontent.com/mosaicml/llm-foundry/$BRANCH_NAME/setup.py setup.py
RUN rm setup.py

# Install and uninstall foundry to cache foundry requirements
RUN git clone -b main https://github.com/mosaicml/llm-foundry.git
RUN git clone -b $BRANCH_NAME https://github.com/mosaicml/llm-foundry.git
RUN pip install --no-cache-dir "./llm-foundry${DEP_GROUPS}"
RUN pip uninstall -y llm-foundry
RUN rm -rf llm-foundry
16 changes: 10 additions & 6 deletions README.md
@@ -114,10 +114,10 @@ You can select a specific commit hash such as `mosaicml/llm-foundry:1.13.1_cu117
| Docker Image | Torch Version | Cuda Version | LLM Foundry dependencies installed? |
| ------------------------------------------------------ | ------------- | ----------------- | ----------------------------------- |
| `mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04` | 2.1.0 | 12.1 (Infiniband) | No |
| `mosaicml/llm-foundry:2.1.0_cu121-latest` | 2.1.0 | 12.1 (Infiniband) | Yes (flash attention v1) |
| `mosaicml/llm-foundry:2.1.0_cu121_flash2-latest` | 2.1.0 | 12.1 (Infiniband) | Yes (flash attention v2) |
| `mosaicml/llm-foundry:2.1.0_cu121_aws-latest` | 2.1.0 | 12.1 (EFA) | Yes (flash attention v1) |
| `mosaicml/llm-foundry:2.1.0_cu121_flash2_aws-latest` | 2.1.0 | 12.1 (EFA) | Yes (flash attention v2) |
| `mosaicml/llm-foundry:2.1.0_cu121-latest` | 2.1.0 | 12.1 (Infiniband) | Yes (flash attention v1. Warning: Support for flash attention v1 has been deprecated.) |
| `mosaicml/llm-foundry:2.1.0_cu121_flash2-latest` | 2.1.0 | 12.1 (Infiniband) | Yes (flash attention v2. Note: We recommend using flash attention v2.) |
| `mosaicml/llm-foundry:2.1.0_cu121_aws-latest` | 2.1.0 | 12.1 (EFA) | Yes (flash attention v1. Warning: Support for flash attention v1 has been deprecated.) |
| `mosaicml/llm-foundry:2.1.0_cu121_flash2_aws-latest` | 2.1.0 | 12.1 (EFA) | Yes (flash attention v2. Note: We recommend using flash attention v2.) |


# Installation
@@ -134,7 +134,9 @@ We *strongly* recommend working with LLM Foundry inside a Docker container (see
```bash
git clone https://github.com/mosaicml/llm-foundry.git
cd llm-foundry
pip install -e ".[gpu]" # or pip install -e . if no NVIDIA GPU
pip install -e ".[gpu-flash2]" # or `pip install -e .` if no NVIDIA GPU.
# Note: Currently, `pip install -e ".[gpu-flash2]"` installs Flash Attention v2, and `pip install -e ".[gpu]"` installs Flash Attention v1.
# However, once the support for Flash Attention v1 is removed, both of these commands will install Flash Attention v2.
```

### Without Docker (not recommended)
@@ -152,7 +154,9 @@ source llmfoundry-venv/bin/activate

pip install cmake packaging torch # setup.py requires these be installed

pip install -e ".[gpu]" # or pip install -e . if no NVIDIA GPU
pip install -e ".[gpu-flash2]" # or `pip install -e .` if no NVIDIA GPU.
# Note: Currently, `pip install -e ".[gpu-flash2]"` installs Flash Attention v2, and `pip install -e ".[gpu]"` installs Flash Attention v1.
# However, once the support for Flash Attention v1 is removed, both of these commands will install Flash Attention v2.
```
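
Whichever install path you use, a quick sanity check is to confirm which Flash Attention build ended up in the environment. The sketch below is illustrative and assumes the `[gpu-flash2]` extra was installed on a CUDA-capable machine; on CPU-only setups the import is expected to fail.

```python
# Minimal post-install check (illustrative): confirm Flash Attention v2 is importable.
import flash_attn

print(flash_attn.__version__)  # a 2.x version indicates Flash Attention v2
```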

### TransformerEngine and amp_fp8 support
16 changes: 8 additions & 8 deletions TUTORIAL.md
@@ -144,8 +144,8 @@ name = 'mosaicml/mpt-7b'
# Download config
config = AutoConfig.from_pretrained(name, trust_remote_code=True)
# (Optional) Use `triton` backend for fast attention. Defaults to `torch`.
# config.attn_config['attn_impl'] = 'triton'
# (Optional) Use `flash` (preferred) or `triton` backend for fast attention. Defaults to `torch`.
# config.attn_config['attn_impl'] = 'flash'
# (Optional) Change the `max_seq_len` allowed for inference
# config.max_seq_len = 4096

Expand Down Expand Up @@ -291,7 +291,7 @@ The purpose of this section is probably pretty self-evident. You’ve got questi
- If OOMs persist with `device_train_microbatch_size: 1` and `device_eval_batch_size: 1`, you may need to use activation checkpointing `fsdp_config.activation_checkpointing: true` (if you are not already) and, as a last resort, activation CPU offloading `fsdp_config.activation_cpu_offload: true`.

### What hardware can I train on?
- In general, this repo should work on any system with NVIDIA GPUs. Checkout the `scripts/train/README.md` for more [details on GPU memory requirements]([https://github.com/mosaicml/llm-foundry/tree/main/scripts/train#how-many-gpus-do-i-need-to-train-a-llm](https://github.com/mosaicml/llm-foundry/tree/main/scripts/train#how-many-gpus-do-i-need-to-train-a-llm)). Keep in mind you may run into issues with `Triton` support on some GPU types. In that situation, you can fall back to `attn_impl: torch` or raise an issue in the [Triton github repo](https://github.com/openai/triton).
- In general, this repo should work on any system with NVIDIA GPUs. Checkout the `scripts/train/README.md` for more [details on GPU memory requirements]([https://github.com/mosaicml/llm-foundry/tree/main/scripts/train#how-many-gpus-do-i-need-to-train-a-llm](https://github.com/mosaicml/llm-foundry/tree/main/scripts/train#how-many-gpus-do-i-need-to-train-a-llm)). We recommend using `Flash` attention instead of `Triton` attention, unless you're training Prefix Language Models (in which case use `Triton`). Keep in mind you may run into issues with `Flash` or `Triton` support on some GPU types. In that situation, you can fall back to `attn_impl: torch`, or raise an issue in the [Flash Attention github repo](https://github.com/Dao-AILab/flash-attention).

### What hardware can I run eval on?
- Similar to above…
@@ -305,15 +305,15 @@ The purpose of this section is probably pretty self-evident. You’ve got questi
### What are the different attention options `torch` / `flash` / `triton` for MPT and which one should I use?
- **Short answer:** `torch` is the native pytorch attention implementation, and `flash` and `triton` are different implementations of the much more optimized [Flash Attention](https://arxiv.org/abs/2205.14135) method. `triton` and `flash` will be faster (and use less GPU memory) than `torch`, but they might not work with all hardware and environment setups.

Our training setups typically use `triton`.
Our training setups typically use `flash`.

- **Long answer:** In NLP, Softmax Attention operates on a sequence. It is an all to all graph operation where, during training, the memory complexity is quadratic with respect to the length of the sequence. Furthermore, on GPUs, naive implementations of Softmax Attention are bandwidth (BW) limited.
[Rabe et al. (2021)](https://arxiv.org/abs/2112.05682) and [Dao et al. (2022)](https://arxiv.org/abs/2205.14135) showed that fusing all operations in Softmax Attention can make the operation much less BW limited.
Furthermore, integrating a recomputation schema decreases the sequence length memory complexity from *quadratic* to *linear*, thereby supporting much longer sequence lengths.

- Setting `attn_config.attn_impl=torch` enables a naive Softmax Attention written using base torch operations.
- Setting `attn_config.attn_impl=flash` enables Flash Attention [implemented by Dao et al in the HazyResearch repo using CUDA](https://github.com/HazyResearch/flash-attention). This will have linear memory complexity (enabling larger batch sizes) and will run much faster.
- Setting `attn_config.attn_impl=triton` enables a Flash Attention [implemented using Triton](https://github.com/mosaicml/llm-foundry/blob/main/llmfoundry/models/layers/flash_attn_triton.py). In our experience, `triton` is slightly faster than `flash`.
- Setting `attn_config.attn_impl=flash` enables Flash Attention [implemented by Dao et al in the Dao-AILab repo using CUDA](https://github.com/Dao-AILab/flash-attention). This will have linear memory complexity (enabling larger batch sizes) and will run much faster.
- Setting `attn_config.attn_impl=triton` enables a Flash Attention [implemented using Triton](https://github.com/mosaicml/llm-foundry/blob/main/llmfoundry/models/layers/flash_attn_triton.py). We recommend using `flash` attention instead of `triton` attention, unless you're training Prefix Language Models (in which case use `Triton`).
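
Concretely, the backend is selected through the model config, mirroring the Hugging Face snippet earlier in this tutorial. The sketch below is illustrative; `flash` is one of the three accepted values, not a requirement:

```python
from transformers import AutoConfig, AutoModelForCausalLM

name = 'mosaicml/mpt-7b'

# Choose the attention backend: 'torch', 'flash', or 'triton'.
config = AutoConfig.from_pretrained(name, trust_remote_code=True)
config.attn_config['attn_impl'] = 'flash'

model = AutoModelForCausalLM.from_pretrained(name, config=config, trust_remote_code=True)
```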

<!-- In NLP, Softmax Attention operates on a sequence. It is an all to all graph operation where, during training, the memory complexity is quadratic with respect to the length of the sequence. Furthermore, on GPUs, naive implementations of Softmax Attention are BW limited.
[Rabe et al. (2021)](https://arxiv.org/abs/2112.05682) and [Dao et al. (2022)](https://arxiv.org/abs/2205.14135) noted that fusing all operations in Softmax Attention can make the operation much less BW limited.
@@ -327,7 +327,7 @@ The majority of our training setups use `triton`. -->
#### Limitations
- For training, `torch` uses a lot of memory and is slow.
- `flash` and `triton` cannot return attention weights and therefore cannot be used with methods that require it.
- `flash` cannot accept an attention bias and therefore cannot be used with methods that require it such as ALiBi.
- `flash` cannot accept an attention bias. However, it still allows the use of ALiBi positional bias.

#### What is `triton-pre-mlir`?
- Torch2 installs and requires a specific version of [Triton](https://openai.com/research/triton).
@@ -352,7 +352,7 @@ Currently we support [Learned Positional Embeddings](https://arxiv.org/pdf/1706.
| Name | YAML Config | Training MFU on MPT-7B trained on 8 A100 80GB GPUs | Notes |
|:-----------------------------------|:------------------------------------------------------------------|:---------------------------------------------------:|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Learned Positional Embeddings | <pre>model:<br> learned_pos_emb:&nbsp;True</pre>| 65.7 | |
| ALiBi | <pre>model:<br> attn_config:<br> alibi:&nbsp;True</pre>| 64.5 | Requires Triton or Torch attention. |
| ALiBi | <pre>model:<br> attn_config:<br> alibi:&nbsp;True</pre>| 64.5 | Requires Flash (v2.4.2 or higher) or Triton or Torch attention. |
| RoPE (Dao-AILab Implementation) | <pre>model:<br> attn_config:<br> rope:&nbsp;True<br> rope_impl:&nbsp;dail</pre>| 64.5 | Requires a CUDA GPU and the [flash-attn library](https://github.com/Dao-AILab/flash-attention) v2.0.1 or higher to be installed. Please see the instructions in the [paragraph above](#support-for-flashattention-2) on how to install flash-attn v2. Note that the attention implementation can still be `torch`, `triton`, or `flash`. |
| RoPE (Hugging<code>&nbsp;</code>Face Implementation) | <pre>model:<br> attn_config:<br> rope:&nbsp;True<br> rope_impl:&nbsp;hf</pre>| 62.3 | |
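
For reference, the YAML in the table maps onto the same `attn_config` dictionary used above. A minimal sketch for the ALiBi row (illustrative; the key names are taken from the table and are assumed to map one-to-one onto the Hugging Face config):

```python
from transformers import AutoConfig

# Illustrative only: enable ALiBi on an MPT config, per the table above.
# Requires Flash Attention v2.4.2+, Triton, or Torch attention.
config = AutoConfig.from_pretrained('mosaicml/mpt-7b', trust_remote_code=True)
config.attn_config['alibi'] = True
config.attn_config['attn_impl'] = 'flash'  # or 'triton' / 'torch'
```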

13 changes: 7 additions & 6 deletions llmfoundry/data/finetuning/dataloader.py
@@ -167,11 +167,9 @@ def build_finetuning_dataloader(cfg: DictConfig,
'When using a HuggingFace dataset from a URL, you must set the ' + \
'`split` key in the dataset config.'
)
# HF datasets does not support a split with dashes, so we replace dashes
# with underscores.
split = split.replace('-', '_')
dataset_name_or_path = _download_remote_hf_dataset(
remote_path=dataset_name_or_path, split=split)
split = split.replace('-', '_')

# Get the preprocessing function.
proto_preprocessing_fn = cfg.dataset.get('preprocessing_fn')
@@ -309,17 +307,20 @@ def _download_remote_hf_dataset(remote_path: str, split: str) -> str:
Raises:
FileNotFoundError: Raised if the dataset file cannot be found with any of the supported extensions.
"""
# HF datasets does not support a split with dashes, so we replace dashes with underscores.
hf_formatted_split = split.replace('-', '_')
finetune_dir = os.path.join(
DOWNLOADED_FT_DATASETS_DIRPATH,
split if split != 'data' else 'data_not',
hf_formatted_split if hf_formatted_split != 'data' else 'data_not',
)
os.makedirs(finetune_dir, exist_ok=True)
for extension in SUPPORTED_EXTENSIONS:
name = f'{remote_path.strip("/")}/{split}{extension}'
destination = str(
os.path.abspath(
os.path.join(finetune_dir, 'data',
f'{split}-00000-of-00001{extension}')))
os.path.join(
finetune_dir, 'data',
f'{hf_formatted_split}-00000-of-00001{extension}')))

# Since we don't know exactly what the extension will be, since it is one of a list
# use a signal file to wait for instead of the desired file
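
The effect of the split renaming added above can be illustrated in isolation. The helper below is hypothetical (it is not part of the module) and only reproduces the destination-path logic:

```python
import os

# HF datasets does not support dashes in split names, so the split is normalized
# with underscores before building the local destination path.
def expected_destination(finetune_dir: str, split: str, extension: str) -> str:
    hf_formatted_split = split.replace('-', '_')
    return os.path.abspath(
        os.path.join(finetune_dir, 'data',
                     f'{hf_formatted_split}-00000-of-00001{extension}'))

print(expected_destination('/tmp/finetune', 'my-eval-split', '.jsonl'))
# -> /tmp/finetune/data/my_eval_split-00000-of-00001.jsonl
```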
(Diffs for the remaining changed files are not shown.)
