Merge branch 'main' into EOS_token_checker
KuuCi authored Feb 1, 2024
2 parents 785f906 + 203edad commit 2cee526
Showing 66 changed files with 274 additions and 236 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/code-quality.yaml
@@ -24,10 +24,10 @@ jobs:
strategy:
matrix:
python_version:
- '3.9'
- '3.10'
- "3.9"
- "3.10"
pip_deps:
- '[dev]'
- "[dev]"
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
6 changes: 3 additions & 3 deletions .github/workflows/codeql-analysis.yml
@@ -9,7 +9,7 @@
# the `language` matrix defined below to confirm you have the correct set of
# supported CodeQL languages.
#
name: 'CodeQL'
name: "CodeQL"

on:
push:
@@ -18,7 +18,7 @@ on:
# The branches below must be a subset of the branches above
branches: [main]
schedule:
- cron: '0 9 * * 1' # Every Monday at 09:00 (9:00 AM)
- cron: "0 9 * * 1" # Every Monday at 09:00 (9:00 AM)

jobs:
analyze:
@@ -32,7 +32,7 @@ jobs:
strategy:
fail-fast: false
matrix:
language: ['python']
language: ["python"]
# CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
# Learn more about CodeQL language support at https://git.io/codeql-language-support

17 changes: 9 additions & 8 deletions .github/workflows/docker.yaml
@@ -17,18 +17,18 @@ jobs:
strategy:
matrix:
include:
- name: '2.1.0_cu121'
- name: "2.1.0_cu121"
base_image: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04
dep_groups: '[gpu]'
- name: '2.1.0_cu121_flash2'
dep_groups: "[gpu]"
- name: "2.1.0_cu121_flash2"
base_image: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04
dep_groups: '[gpu-flash2]'
- name: '2.1.0_cu121_aws'
dep_groups: "[gpu-flash2]"
- name: "2.1.0_cu121_aws"
base_image: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04-aws
dep_groups: '[gpu]'
- name: '2.1.0_cu121_flash2_aws'
dep_groups: "[gpu]"
- name: "2.1.0_cu121_flash2_aws"
base_image: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04-aws
dep_groups: '[gpu-flash2]'
dep_groups: "[gpu-flash2]"
steps:
- name: Maximize Build Space on Worker
uses: easimon/maximize-build-space@v4
@@ -88,5 +88,6 @@ jobs:
cache-from: type=registry,ref=${{ env.IMAGE_CACHE }}
cache-to: type=registry,ref=${{ env.IMAGE_CACHE }},mode=max
build-args: |
BRANCH_NAME=${{ github.head_ref || github.ref_name }}
BASE_IMAGE=${{ matrix.base_image }}
DEP_GROUPS=${{ matrix.dep_groups }}
6 changes: 3 additions & 3 deletions .github/workflows/pr-cpu.yaml
@@ -19,10 +19,10 @@ jobs:
strategy:
matrix:
include:
- name: 'cpu-2.1.0'
- name: "cpu-2.1.0"
container: mosaicml/pytorch:2.1.0_cpu-python3.10-ubuntu20.04
markers: 'not gpu'
pytest_command: 'coverage run -m pytest'
markers: "not gpu"
pytest_command: "coverage run -m pytest"
name: ${{ matrix.name }}
if: github.repository_owner == 'mosaicml'
with:
16 changes: 8 additions & 8 deletions .github/workflows/pr-gpu.yaml
@@ -19,16 +19,16 @@ jobs:
strategy:
matrix:
include:
- name: 'gpu-2.1.0'
- name: "gpu-2.1.0"
container: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04
markers: 'gpu'
pytest_command: 'coverage run -m pytest'
deps_group: 'all'
- name: 'gpu-2.1.0-flash2'
markers: "gpu"
pytest_command: "coverage run -m pytest"
deps_group: "all"
- name: "gpu-2.1.0-flash2"
container: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest
markers: 'gpu'
pytest_command: 'coverage run -m pytest'
deps_group: 'all-flash2'
markers: "gpu"
pytest_command: "coverage run -m pytest"
deps_group: "all-flash2"
name: ${{ matrix.name }}
if: github.repository_owner == 'mosaicml'
with:
2 changes: 1 addition & 1 deletion .github/workflows/pytest-gpu.yaml
@@ -30,7 +30,7 @@ on:
required: true
jobs:
pytest-gpu:
timeout-minutes: 60 # ${{ inputs.gha-timeout }} for some reason not able to turn this into an input
timeout-minutes: 60 # ${{ inputs.gha-timeout }} for some reason not able to turn this into an input
runs-on: ubuntu-latest
env:
MOSAICML_API_KEY: ${{ secrets.mcloud-api-key }}
4 changes: 2 additions & 2 deletions .github/workflows/release.yaml
@@ -3,7 +3,7 @@ name: Release
on:
push:
tags:
- 'v*'
- "v*"
workflow_dispatch:

jobs:
@@ -22,7 +22,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v3
with:
python-version: '3.9'
python-version: "3.9"

- name: Build source and wheel distributions
run: |
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -62,9 +62,9 @@ repos:
- id: insert-license
args:
- --license-filepath
- .ci/FILE_HEADER
- .pre-commit/FILE_HEADER
- --comment-style
- '#'
- "#"
- --allow-past-years
types: [python]
- repo: https://github.com/PyCQA/docformatter
File renamed without changes.
2 changes: 1 addition & 1 deletion .yamllint.yaml
@@ -5,7 +5,6 @@ yaml-files:

ignore: |
wandb
*
rules:
braces:
@@ -30,6 +29,7 @@ rules:
key-duplicates: enable
key-ordering: disable
line-length:
max: 120
allow-non-breakable-words: true
allow-non-breakable-inline-mappings: true
new-line-at-end-of-file: enable
8 changes: 7 additions & 1 deletion Dockerfile
@@ -4,10 +4,16 @@
ARG BASE_IMAGE
FROM $BASE_IMAGE

ARG BRANCH_NAME
ARG DEP_GROUPS

# Check for changes in setup.py.
# If there are changes, the docker cache is invalidated and a fresh pip installation is triggered.
ADD https://raw.githubusercontent.com/mosaicml/llm-foundry/$BRANCH_NAME/setup.py setup.py
RUN rm setup.py

# Install and uninstall foundry to cache foundry requirements
RUN git clone -b main https://github.com/mosaicml/llm-foundry.git
RUN git clone -b $BRANCH_NAME https://github.com/mosaicml/llm-foundry.git
RUN pip install --no-cache-dir "./llm-foundry${DEP_GROUPS}"
RUN pip uninstall -y llm-foundry
RUN rm -rf llm-foundry
16 changes: 10 additions & 6 deletions README.md
@@ -114,10 +114,10 @@ You can select a specific commit hash such as `mosaicml/llm-foundry:1.13.1_cu117
| Docker Image | Torch Version | Cuda Version | LLM Foundry dependencies installed? |
| ------------------------------------------------------ | ------------- | ----------------- | ----------------------------------- |
| `mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04` | 2.1.0 | 12.1 (Infiniband) | No |
| `mosaicml/llm-foundry:2.1.0_cu121-latest` | 2.1.0 | 12.1 (Infiniband) | Yes (flash attention v1) |
| `mosaicml/llm-foundry:2.1.0_cu121_flash2-latest` | 2.1.0 | 12.1 (Infiniband) | Yes (flash attention v2) |
| `mosaicml/llm-foundry:2.1.0_cu121_aws-latest` | 2.1.0 | 12.1 (EFA) | Yes (flash attention v1) |
| `mosaicml/llm-foundry:2.1.0_cu121_flash2_aws-latest` | 2.1.0 | 12.1 (EFA) | Yes (flash attention v2) |
| `mosaicml/llm-foundry:2.1.0_cu121-latest` | 2.1.0 | 12.1 (Infiniband) | Yes (flash attention v1. Warning: Support for flash attention v1 has been deprecated.) |
| `mosaicml/llm-foundry:2.1.0_cu121_flash2-latest` | 2.1.0 | 12.1 (Infiniband) | Yes (flash attention v2. Note: We recommend using flash attention v2.) |
| `mosaicml/llm-foundry:2.1.0_cu121_aws-latest` | 2.1.0 | 12.1 (EFA) | Yes (flash attention v1. Warning: Support for flash attention v1 has been deprecated.) |
| `mosaicml/llm-foundry:2.1.0_cu121_flash2_aws-latest` | 2.1.0 | 12.1 (EFA) | Yes (flash attention v2. Note: We recommend using flash attention v2.) |


# Installation
@@ -134,7 +134,9 @@ We *strongly* recommend working with LLM Foundry inside a Docker container (see
```bash
git clone https://github.com/mosaicml/llm-foundry.git
cd llm-foundry
pip install -e ".[gpu]" # or pip install -e . if no NVIDIA GPU
pip install -e ".[gpu-flash2]" # or `pip install -e .` if no NVIDIA GPU.
# Note: Currently, `pip install -e ".[gpu-flash2]"` installs Flash Attention v2, and `pip install -e ".[gpu]"` installs Flash Attention v1.
# However, once the support for Flash Attention v1 is removed, both of these commands will install Flash Attention v2.
```

### Without Docker (not recommended)
@@ -152,7 +154,9 @@ source llmfoundry-venv/bin/activate

pip install cmake packaging torch # setup.py requires these be installed

pip install -e ".[gpu]" # or pip install -e . if no NVIDIA GPU
pip install -e ".[gpu-flash2]" # or `pip install -e .` if no NVIDIA GPU.
# Note: Currently, `pip install -e ".[gpu-flash2]"` installs Flash Attention v2, and `pip install -e ".[gpu]"` installs Flash Attention v1.
# However, once the support for Flash Attention v1 is removed, both of these commands will install Flash Attention v2.
```
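
Whichever install path you use, a quick sanity check is to confirm which Flash Attention build ended up in the environment. The sketch below is illustrative and assumes the `[gpu-flash2]` extra was installed on a CUDA-capable machine; on CPU-only setups the import is expected to fail.

```python
# Minimal post-install check (illustrative): confirm Flash Attention v2 is importable.
import flash_attn

print(flash_attn.__version__)  # a 2.x version indicates Flash Attention v2
```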

### TransformerEngine and amp_fp8 support
16 changes: 8 additions & 8 deletions TUTORIAL.md
@@ -144,8 +144,8 @@ name = 'mosaicml/mpt-7b'
# Download config
config = AutoConfig.from_pretrained(name, trust_remote_code=True)
# (Optional) Use `triton` backend for fast attention. Defaults to `torch`.
# config.attn_config['attn_impl'] = 'triton'
# (Optional) Use `flash` (preferred) or `triton` backend for fast attention. Defaults to `torch`.
# config.attn_config['attn_impl'] = 'flash'
# (Optional) Change the `max_seq_len` allowed for inference
# config.max_seq_len = 4096

Expand Down Expand Up @@ -291,7 +291,7 @@ The purpose of this section is probably pretty self-evident. You’ve got questi
- If OOMs persist with `device_train_microbatch_size: 1` and `device_eval_batch_size: 1`, you may need to use activation checkpointing `fsdp_config.activation_checkpointing: true` (if you are not already) and, as a last resort, activation CPU offloading `fsdp_config.activation_cpu_offload: true`.

### What hardware can I train on?
- In general, this repo should work on any system with NVIDIA GPUs. Checkout the `scripts/train/README.md` for more [details on GPU memory requirements]([https://github.com/mosaicml/llm-foundry/tree/main/scripts/train#how-many-gpus-do-i-need-to-train-a-llm](https://github.com/mosaicml/llm-foundry/tree/main/scripts/train#how-many-gpus-do-i-need-to-train-a-llm)). Keep in mind you may run into issues with `Triton` support on some GPU types. In that situation, you can fall back to `attn_impl: torch` or raise an issue in the [Triton github repo](https://github.com/openai/triton).
- In general, this repo should work on any system with NVIDIA GPUs. Checkout the `scripts/train/README.md` for more [details on GPU memory requirements]([https://github.com/mosaicml/llm-foundry/tree/main/scripts/train#how-many-gpus-do-i-need-to-train-a-llm](https://github.com/mosaicml/llm-foundry/tree/main/scripts/train#how-many-gpus-do-i-need-to-train-a-llm)). We recommend using `Flash` attention instead of `Triton` attention, unless you're training Prefix Language Models (in which case use `Triton`). Keep in mind you may run into issues with `Flash` or `Triton` support on some GPU types. In that situation, you can fall back to `attn_impl: torch`, or raise an issue in the [Flash Attention github repo](https://github.com/Dao-AILab/flash-attention).

### What hardware can I run eval on?
- Similar to above…
@@ -305,15 +305,15 @@ The purpose of this section is probably pretty self-evident. You’ve got questi
### What are the different attention options `torch` / `flash` / `triton` for MPT and which one should I use?
- **Short answer:** `torch` is the native pytorch attention implementation, and `flash` and `triton` are different implementations of the much more optimized [Flash Attention](https://arxiv.org/abs/2205.14135) method. `triton` and `flash` will be faster (and use less GPU memory) than `torch`, but they might not work with all hardware and environment setups.

Our training setups typically use `triton`.
Our training setups typically use `flash`.

- **Long answer:** In NLP, Softmax Attention operates on a sequence. It is an all to all graph operation where, during training, the memory complexity is quadratic with respect to the length of the sequence. Furthermore, on GPUs, naive implementations of Softmax Attention are bandwidth (BW) limited.
[Rabe et al. (2021)](https://arxiv.org/abs/2112.05682) and [Dao et al. (2022)](https://arxiv.org/abs/2205.14135) showed that fusing all operations in Softmax Attention can make the operation much less BW limited.
Furthermore, integrating a recomputation schema decreases the sequence length memory complexity from *quadratic* to *linear*, thereby supporting much longer sequence lengths.

- Setting `attn_config.attn_impl=torch` enables a naive Softmax Attention written using base torch operations.
- Setting `attn_config.attn_impl=flash` enables Flash Attention [implemented by Dao et al in the HazyResearch repo using CUDA](https://github.com/HazyResearch/flash-attention). This will have linear memory complexity (enabling larger batch sizes) and will run much faster.
- Setting `attn_config.attn_impl=triton` enables a Flash Attention [implemented using Triton](https://github.com/mosaicml/llm-foundry/blob/main/llmfoundry/models/layers/flash_attn_triton.py). In our experience, `triton` is slightly faster than `flash`.
- Setting `attn_config.attn_impl=flash` enables Flash Attention [implemented by Dao et al in the Dao-AILab repo using CUDA](https://github.com/Dao-AILab/flash-attention). This will have linear memory complexity (enabling larger batch sizes) and will run much faster.
- Setting `attn_config.attn_impl=triton` enables a Flash Attention [implemented using Triton](https://github.com/mosaicml/llm-foundry/blob/main/llmfoundry/models/layers/flash_attn_triton.py). We recommend using `flash` attention instead of `triton` attention, unless you're training Prefix Language Models (in which case use `Triton`).
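
Concretely, the backend is selected through the model config, mirroring the Hugging Face snippet earlier in this tutorial. The sketch below is illustrative; `flash` is one of the three accepted values, not a requirement:

```python
from transformers import AutoConfig, AutoModelForCausalLM

name = 'mosaicml/mpt-7b'

# Choose the attention backend: 'torch', 'flash', or 'triton'.
config = AutoConfig.from_pretrained(name, trust_remote_code=True)
config.attn_config['attn_impl'] = 'flash'

model = AutoModelForCausalLM.from_pretrained(name, config=config, trust_remote_code=True)
```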

<!-- In NLP, Softmax Attention operates on a sequence. It is an all to all graph operation where, during training, the memory complexity is quadratic with respect to the length of the sequence. Furthermore, on GPUs, naive implementations of Softmax Attention are BW limited.
[Rabe et al. (2021)](https://arxiv.org/abs/2112.05682) and [Dao et al. (2022)](https://arxiv.org/abs/2205.14135) noted that fusing all operations in Softmax Attention can make the operation much less BW limited.
@@ -327,7 +327,7 @@ The majority of our training setups use `triton`. -->
#### Limitations
- For training, `torch` uses a lot of memory and is slow.
- `flash` and `triton` cannot return attention weights and therefore cannot be used with methods that require it.
- `flash` cannot accept an attention bias and therefore cannot be used with methods that require it such as ALiBi.
- `flash` cannot accept an attention bias. However, it still allows the use of ALiBi positional bias.

#### What is `triton-pre-mlir`?
- Torch2 installs and requires a specific version of [Triton](https://openai.com/research/triton).
@@ -352,7 +352,7 @@ Currently we support [Learned Positional Embeddings](https://arxiv.org/pdf/1706.
| Name | YAML Config | Training MFU on MPT-7B trained on 8 A100 80GB GPUs | Notes |
|:-----------------------------------|:------------------------------------------------------------------|:---------------------------------------------------:|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Learned Positional Embeddings | <pre>model:<br> learned_pos_emb:&nbsp;True</pre>| 65.7 | |
| ALiBi | <pre>model:<br> attn_config:<br> alibi:&nbsp;True</pre>| 64.5 | Requires Triton or Torch attention. |
| ALiBi | <pre>model:<br> attn_config:<br> alibi:&nbsp;True</pre>| 64.5 | Requires Flash (v2.4.2 or higher) or Triton or Torch attention. |
| RoPE (Dao-AILab Implementation) | <pre>model:<br> attn_config:<br> rope:&nbsp;True<br> rope_impl:&nbsp;dail</pre>| 64.5 | Requires a CUDA GPU and the [flash-attn library](https://github.com/Dao-AILab/flash-attention) v2.0.1 or higher to be installed. Please see the instructions in the [paragraph above](#support-for-flashattention-2) on how to install flash-attn v2. Note that the attention implementation can still be `torch`, `triton`, or `flash`. |
| RoPE (Hugging<code>&nbsp;</code>Face Implementation) | <pre>model:<br> attn_config:<br> rope:&nbsp;True<br> rope_impl:&nbsp;hf</pre>| 62.3 | |
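
For reference, the YAML in the table maps onto the same `attn_config` dictionary used above. A minimal sketch for the ALiBi row (illustrative; the key names are taken from the table and are assumed to map one-to-one onto the Hugging Face config):

```python
from transformers import AutoConfig

# Illustrative only: enable ALiBi on an MPT config, per the table above.
# Requires Flash Attention v2.4.2+, Triton, or Torch attention.
config = AutoConfig.from_pretrained('mosaicml/mpt-7b', trust_remote_code=True)
config.attn_config['alibi'] = True
config.attn_config['attn_impl'] = 'flash'  # or 'triton' / 'torch'
```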

13 changes: 7 additions & 6 deletions llmfoundry/data/finetuning/dataloader.py
@@ -167,11 +167,9 @@ def build_finetuning_dataloader(cfg: DictConfig,
'When using a HuggingFace dataset from a URL, you must set the ' + \
'`split` key in the dataset config.'
)
# HF datasets does not support a split with dashes, so we replace dashes
# with underscores.
split = split.replace('-', '_')
dataset_name_or_path = _download_remote_hf_dataset(
remote_path=dataset_name_or_path, split=split)
split = split.replace('-', '_')

# Get the preprocessing function.
proto_preprocessing_fn = cfg.dataset.get('preprocessing_fn')
@@ -309,17 +307,20 @@ def _download_remote_hf_dataset(remote_path: str, split: str) -> str:
Raises:
FileNotFoundError: Raised if the dataset file cannot be found with any of the supported extensions.
"""
# HF datasets does not support a split with dashes, so we replace dashes with underscores.
hf_formatted_split = split.replace('-', '_')
finetune_dir = os.path.join(
DOWNLOADED_FT_DATASETS_DIRPATH,
split if split != 'data' else 'data_not',
hf_formatted_split if hf_formatted_split != 'data' else 'data_not',
)
os.makedirs(finetune_dir, exist_ok=True)
for extension in SUPPORTED_EXTENSIONS:
name = f'{remote_path.strip("/")}/{split}{extension}'
destination = str(
os.path.abspath(
os.path.join(finetune_dir, 'data',
f'{split}-00000-of-00001{extension}')))
os.path.join(
finetune_dir, 'data',
f'{hf_formatted_split}-00000-of-00001{extension}')))

# Since we don't know exactly what the extension will be, since it is one of a list
# use a signal file to wait for instead of the desired file
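
The effect of the split renaming added above can be illustrated in isolation. The helper below is hypothetical (it is not part of the module) and only reproduces the destination-path logic:

```python
import os

# HF datasets does not support dashes in split names, so the split is normalized
# with underscores before building the local destination path.
def expected_destination(finetune_dir: str, split: str, extension: str) -> str:
    hf_formatted_split = split.replace('-', '_')
    return os.path.abspath(
        os.path.join(finetune_dir, 'data',
                     f'{hf_formatted_split}-00000-of-00001{extension}'))

print(expected_destination('/tmp/finetune', 'my-eval-split', '.jsonl'))
# -> /tmp/finetune/data/my_eval_split-00000-of-00001.jsonl
```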
(Diffs for the remaining changed files are not shown.)
