Merge pull request #3 from mosaicml/main
Pulling from the main repository
ShashankMosaicML authored Oct 17, 2023
2 parents 8b886ba + cc238a3 commit 76a2095
Showing 35 changed files with 2,159 additions and 514 deletions.
4 changes: 2 additions & 2 deletions .github/mcp/mcp_pytest.py
@@ -93,10 +93,10 @@
export COMMON_ARGS="-v --durations=20 -m '{args.pytest_markers}' {clear_tmp_path_flag}"
make test PYTEST='{args.pytest_command}' EXTRA_ARGS="$COMMON_ARGS --codeblocks"
make test-dist PYTEST='{args.pytest_command}' EXTRA_ARGS="$COMMON_ARGS" WORLD_SIZE=2
make test PYTEST='{args.pytest_command}' EXTRA_ARGS="$COMMON_ARGS --codeblocks"
python -m coverage combine
python -m coverage report
41 changes: 36 additions & 5 deletions .github/workflows/docker.yaml
@@ -3,6 +3,12 @@ on:
push:
branches:
- main
pull_request:
branches:
- main
paths:
- ./Dockerfile
- .github/workflows/docker.yaml
workflow_dispatch: {}
jobs:
docker-build:
@@ -13,10 +19,16 @@
include:
- name: '1.13.1_cu117'
base_image: mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
dep_groups: '[gpu]'
- name: '2.0.1_cu118'
base_image: mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04
dep_groups: '[gpu]'
- name: '2.1.0_cu121'
base_image: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04
dep_groups: '[gpu]'
- name: '2.1.0_cu121_flash2'
base_image: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04
dep_groups: '[gpu-flash2]'

steps:
- name: Maximize Build Space on Worker
@@ -52,13 +64,32 @@ jobs:
GIT_SHA=$(echo ${{ github.sha }} | cut -c1-7)
echo "IMAGE_TAG=${GIT_SHA}" >> ${GITHUB_ENV}
if [ "${{ github.event_name }}" == "push" ]; then
echo "Triggered by push event."
PROD_REPO="mosaicml/llm-foundry"
IMAGE_TAG="${PROD_REPO}:${{matrix.name}}-${GIT_SHA},${PROD_REPO}:${{matrix.name}}-latest"
IMAGE_CACHE="${PROD_REPO}:${{matrix.name}}-buildcache"
elif [ "${{ github.event_name }}" == "pull_request" ]; then
echo "Triggered by pull_request event."
STAGING_REPO="mosaicml/ci-staging"
IMAGE_TAG="${STAGING_REPO}:${{matrix.name}}-${GIT_SHA}"
IMAGE_CACHE="${STAGING_REPO}:${{matrix.name}}-buildcache"
else
echo "Triggered by unknown event: ${{ github.event_name }}"
exit 1
fi
echo "IMAGE_TAG=${IMAGE_TAG}" >> ${GITHUB_ENV}
echo "IMAGE_CACHE=${IMAGE_CACHE}" >> ${GITHUB_ENV}
- name: Build and Push the Docker Image
uses: docker/build-push-action@v3
with:
context: .
tags: mosaicml/llm-foundry:${{ matrix.name }}-latest,
mosaicml/llm-foundry:${{ matrix.name }}-${{ env.IMAGE_TAG }}
tags: ${{ env.IMAGE_TAG }}
push: true
cache-from: type=registry,ref=mosaicml/llm-foundry:${{ matrix.name }}-buildcache
cache-to: type=registry,ref=mosaicml/llm-foundry:${{ matrix.name }}-buildcache,mode=max
build-args: BASE_IMAGE=${{ matrix.base_image }}
cache-from: type=registry,ref=${{ env.IMAGE_CACHE }}
cache-to: type=registry,ref=${{ env.IMAGE_CACHE }},mode=max
build-args: |
BASE_IMAGE=${{ matrix.base_image }}
DEP_GROUPS=${{ matrix.dep_groups }}
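
> Editor's note: the "Build and Push" step above now branches on the triggering event, publishing production tags on pushes to `main` and staging tags on pull requests. The sketch below is a minimal Python mirror of that derivation for illustration only; the function name and sample values are hypothetical and not part of this PR.

```python
# Illustrative mirror of the tag/cache derivation in the workflow step above.
def image_refs(event_name: str, name: str, git_sha: str):
    short_sha = git_sha[:7]  # GIT_SHA=$(echo ${{ github.sha }} | cut -c1-7)
    if event_name == 'push':
        # Pushes to main publish to the production repo, with both a
        # SHA-pinned tag and a rolling "-latest" tag.
        repo = 'mosaicml/llm-foundry'
        image_tag = f'{repo}:{name}-{short_sha},{repo}:{name}-latest'
    elif event_name == 'pull_request':
        # Pull requests publish only a SHA-pinned tag, to the staging repo.
        repo = 'mosaicml/ci-staging'
        image_tag = f'{repo}:{name}-{short_sha}'
    else:
        raise ValueError(f'Triggered by unknown event: {event_name}')
    image_cache = f'{repo}:{name}-buildcache'
    return image_tag, image_cache

# For example, a PR build of the new flash2 image would yield:
#   image_refs('pull_request', '2.1.0_cu121_flash2', '<full 40-char sha>')
#   -> ('mosaicml/ci-staging:2.1.0_cu121_flash2-<sha7>',
#       'mosaicml/ci-staging:2.1.0_cu121_flash2-buildcache')
```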
5 changes: 5 additions & 0 deletions .github/workflows/pr-gpu.yaml
@@ -18,6 +18,7 @@ jobs:
uses: ./.github/workflows/pytest-gpu.yaml
strategy:
matrix:
# TODO: After the PR with the flash attention 2 images goes in, add the new unit test suite
include:
- name: 'gpu-latest'
container: mosaicml/pytorch:latest # mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
@@ -31,6 +32,10 @@ jobs:
container: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04
markers: 'gpu'
pytest_command: 'coverage run -m pytest'
- name: 'gpu-2.1.0-flash2'
container: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest
markers: 'gpu'
pytest_command: 'coverage run -m pytest'
name: ${{ matrix.name }}
if: github.repository_owner == 'mosaicml'
with:
9 changes: 5 additions & 4 deletions Dockerfile
@@ -4,9 +4,10 @@
ARG BASE_IMAGE
FROM $BASE_IMAGE

ARG DEP_GROUPS

# Install and uninstall foundry to cache foundry requirements
RUN git clone -b main https://github.com/mosaicml/llm-foundry.git && \
pip install --no-cache-dir "./llm-foundry[gpu]" && \
pip uninstall -y llm-foundry && \
rm -rf llm-foundry
RUN git clone -b main https://github.com/mosaicml/llm-foundry.git
RUN pip install --no-cache-dir "./llm-foundry${DEP_GROUPS}"
RUN pip uninstall -y llm-foundry
RUN rm -rf llm-foundry
5 changes: 5 additions & 0 deletions README.md
@@ -93,8 +93,10 @@ If you have success/failure using LLM Foundry on other systems, please let us kn
|---------------------------|------------------|--------------|-------------------------------|
| A100-40GB/80GB | 1.13.1 | 11.7 | :white_check_mark: Supported |
| A100-40GB/80GB | 2.0.1 | 11.7, 11.8 | :white_check_mark: Supported |
| A100-40GB/80GB | 2.1.0 | 11.8, 12.1 | :white_check_mark: Supported |
| H100-80GB | 1.13.1 | 11.7 | :x: Not Supported |
| H100-80GB | 2.0.1 | 11.8 | :white_check_mark: Supported |
| H100-80GB | 2.1.0 | 12.1 | :white_check_mark: Supported |
| A10-24GB | 1.13.1 | 11.7 | :construction: In Progress |
| A10-24GB | 2.0.1 | 11.7, 11.8 | :construction: In Progress |
| MI250 | 2.0.1 | ROCm 5.4 | :construction: In Progress |
@@ -113,8 +115,11 @@ You can select a specific commit hash such as `mosaicml/llm-foundry:1.13.1_cu117
|-------------------------------------------------------------|----------------|--------------|-------------------------------------|
| `mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04` | 1.13.1 | 11.7 | No |
| `mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04` | 2.0.1 | 11.8 | No |
| `mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04` | 2.1.0 | 12.1 | No |
| `mosaicml/llm-foundry:1.13.1_cu117-latest` | 1.13.1 | 11.7 | Yes |
| `mosaicml/llm-foundry:2.0.1_cu118-latest` | 2.0.1 | 11.8 | Yes |
| `mosaicml/llm-foundry:2.1.0_cu121-latest` | 2.1.0 | 12.1 | Yes (flash attention v1) |
| `mosaicml/llm-foundry:2.1.0_cu121_flash2-latest` | 2.1.0 | 12.1 | Yes (flash attention v2) |
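
> Editor's note: as a quick, illustrative check (not part of this change), you can confirm which Flash Attention major version a given image ships by running Python inside the container and inspecting the installed `flash_attn` package, which the llm-foundry images above are expected to include.

```python
# Illustrative check, not part of this PR: run inside one of the
# mosaicml/llm-foundry images to see which flash-attn major version is installed.
import flash_attn

print(flash_attn.__version__)
# Expected to start with '1.' in 2.1.0_cu121-latest and with '2.' in
# 2.1.0_cu121_flash2-latest, per the rows above.
```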


# Installation
121 changes: 16 additions & 105 deletions llmfoundry/callbacks/generate_callback.py
@@ -1,119 +1,30 @@
# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0

"""Periodically log generations to wandb from a set of prompts."""
from typing import Any, List, Union, cast
"""Deprecated Generate callback.
import torch
import wandb
from composer.core import Callback, State, get_precision_context
from composer.loggers import Logger, WandBLogger
from composer.utils import dist, ensure_tuple
Please use composer.callbacks.Generate instead.
"""
import warnings
from typing import Any, List, Union

from composer.callbacks import Generate as ComposerGenerate
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast

Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]


class Generate(Callback):
class Generate(ComposerGenerate):

def __init__(self, prompts: List[str], batch_log_interval: int,
**kwargs: Any):
"""Periodically log generations to wandb from a set of prompts.
In the main view for a run, there will be a table that will show the _last_ logged generations.
To compare previous iterations of the generations, you need to
1. Click on the run
2. Click on "artifacts" in the menu on the left side of the screen
3. Click on one of the artifacts called "predictions"
4. Click on the "files" tab
5. Click on "predictions.table.json"
6. On the left hand side, there are different versions of the table produced throughout training. Select one of these.
7. Now, when you hover over other versions, there will be a "compare" button, which will allow you to compare the currently
selected version to the version you add via compare.
Args:
prompts (List[str]): The list of prompts you would like to produce generations for
batch_log_interval (int): The interval (in batches) at which this callback runs
kwargs: All kwargs well be passed along to the call to generate. This is for things like `do_sample`, `top_p`, etc
"""
self.prompts = prompts
self.batch_log_interval = batch_log_interval
self.generate_kwargs = kwargs
self.wandb_logger = None

def init(self, state: State, logger: Logger):
if dist.get_global_rank() == 0:
for destination in ensure_tuple(logger.destinations):
if isinstance(destination, WandBLogger):
self.wandb_logger = destination

def batch_checkpoint(self, state: State, logger: Logger) -> None:
if (state.timestamp.batch.value % self.batch_log_interval) == 0:
self.generate(state, logger)

def generate(self, state: State, logger: Logger) -> None:
model = state.model
original_mode = model.training
model.eval()
tokenizer = cast(Tokenizer, state.model.tokenizer)
device = state.device

if not hasattr(model.model, 'generate'):
raise ValueError(
f'Cannot generate from model {model.model.__class__.__name__} because it does not have a `generate` method'
)

# stash the original original value of padding_side because generation requires left padding
original_padding_side = tokenizer.padding_side
tokenizer.padding_side = 'left'
if tokenizer.pad_token_id is None:
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenized_input = tokenizer(self.prompts,
return_tensors='pt',
padding=True)

for k, v in tokenized_input.items():
tokenized_input[k] = device.tensor_to_device(v)

# dummy forward call needed for FSDP to work consistently
dummy_input = torch.tensor([[0]], dtype=torch.long)
dummy_input = device.tensor_to_device(dummy_input)
with get_precision_context(state.precision):
with torch.no_grad():
assert isinstance(model.model, torch.nn.Module)
_ = model.model(input_ids=dummy_input)

output_token_ids = model.model.generate( # type: ignore
input_ids=tokenized_input['input_ids'],
attention_mask=tokenized_input['attention_mask'],
synced_gpus=True,
**self.generate_kwargs,
)

if dist.get_global_rank() == 0:
if self.wandb_logger is not None:
assert wandb.run is not None, 'wandb should have started run'

artifact = wandb.Artifact('generate_samples_' +
str(wandb.run.id),
type='predictions')

rows = []
for i in range(len(self.prompts)):
prompt = self.prompts[i]
output_tokens = output_token_ids[i][
tokenized_input['input_ids'].shape[1]:]
output_text = tokenizer.decode(output_tokens,
skip_special_tokens=True)

rows.append([prompt, output_text])

text_table = wandb.Table(data=rows,
columns=['prompt', 'generation'])
artifact.add(text_table, 'predictions')
wandb.log_artifact(artifact)
wandb.log({'generations': text_table},
step=state.timestamp.batch.value)
warnings.warn(
('Accessing llmfoundry.callbacks.generate_callback.Generate '
'is deprecated and will be removed in a future release. '
'Please use composer.callbacks.Generate instead.'),
DeprecationWarning,
)

tokenizer.padding_side = original_padding_side
model.train(mode=original_mode)
interval = f'{batch_log_interval}ba'
super().__init__(prompts=prompts, interval=interval, **kwargs)
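
> Editor's note: the callback is now a thin deprecation shim over `composer.callbacks.Generate`, so existing llmfoundry configs keep working while new code can construct the Composer callback directly. Below is a minimal sketch of the recommended replacement; the prompts, interval, and generation kwargs are placeholder values, not taken from this PR.

```python
# Minimal sketch of the recommended replacement for the deprecated
# llmfoundry Generate callback; values shown are placeholders.
from composer.callbacks import Generate

generate_cb = Generate(
    prompts=['The quick brown fox'],  # prompts to periodically generate from
    interval='100ba',                 # every 100 batches (the shim builds f'{batch_log_interval}ba')
    do_sample=True,                   # remaining kwargs are forwarded to generate()
    max_new_tokens=64,
)
# The callback is then passed to the Composer Trainer (callbacks=[generate_cb])
# like any other callback.
```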