Merge branch 'main' into shashank/seq_id_flash_attn

mosaicml · Nov 21, 2023 · 55625ff
2 parents 5d7805d + 9bf21f2
commit 55625ff
Show file tree

Hide file tree

Showing 68 changed files with 72,427 additions and 338 deletions.
diff --git a/.github/mcp/mcp_pytest.py b/.github/mcp/mcp_pytest.py
@@ -54,6 +54,9 @@
                         type=int,
                         default=1800,
                         help='Timeout for run (in seconds)')
+    parser.add_argument('--deps_group',
+                        type=str,
+                        help='Dependency group to install')
     args = parser.parse_args()
 
     name = args.name
@@ -89,7 +92,7 @@
     clear_tmp_path_flag = '-o tmp_path_retention_policy=none'
     command += f'''
 
-    pip install --upgrade --user .[all]
+    pip install --upgrade --user .[{args.deps_group}]
 
     export COMMON_ARGS="-v --durations=20 -m '{args.pytest_markers}' {clear_tmp_path_flag}"
 

diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml
@@ -19,12 +19,8 @@ jobs:
     strategy:
       matrix:
         include:
-        - name: 'cpu-latest'
-          container: mosaicml/pytorch:latest_cpu  # mosaicml/pytorch:1.13.1_cpu-python3.10-ubuntu20.04
-          markers: 'not gpu'
-          pytest_command: 'coverage run -m pytest'
-        - name: 'cpu-2.0.1'
-          container: mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04
+        - name: 'cpu-1.13.1'
+          container: mosaicml/pytorch:1.13.1_cpu-python3.10-ubuntu20.04
           markers: 'not gpu'
           pytest_command: 'coverage run -m pytest'
         - name: 'cpu-2.1.0'

diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml
@@ -18,24 +18,22 @@ jobs:
     uses: ./.github/workflows/pytest-gpu.yaml
     strategy:
       matrix:
-        # TODO: After the PR with the flash attention 2 images goes in, add the new unit test suite
         include:
-        - name: 'gpu-latest'
-          container: mosaicml/pytorch:latest  # mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
-          markers: 'gpu'
-          pytest_command: 'coverage run -m pytest'
-        - name: 'gpu-2.0.1'
-          container: mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04
+        - name: 'gpu-1.13.1'
+          container: mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
           markers: 'gpu'
           pytest_command: 'coverage run -m pytest'
+          deps_group: 'all'
         - name: 'gpu-2.1.0'
           container: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04
           markers: 'gpu'
           pytest_command: 'coverage run -m pytest'
+          deps_group: 'all'
         - name: 'gpu-2.1.0-flash2'
           container: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest
           markers: 'gpu'
           pytest_command: 'coverage run -m pytest'
+          deps_group: 'all-flash2'
     name: ${{ matrix.name }}
     if: github.repository_owner == 'mosaicml'
     with:
@@ -45,5 +43,6 @@ jobs:
       pytest-command: ${{ matrix.pytest_command }}
       pytest-markers: ${{ matrix.markers }}
       python-version: 3.9
+      deps-group: ${{ matrix.deps_group }}
     secrets:
       mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }}
diff --git a/.github/workflows/pytest-gpu.yaml b/.github/workflows/pytest-gpu.yaml
@@ -22,6 +22,9 @@ on:
         required: false
         type: string
         default: 3.9
+      deps-group:
+        required: true
+        type: string
     secrets:
       mcloud-api-key:
         required: true
@@ -77,4 +80,5 @@ jobs:
               --image '${{ inputs.container }}' \
               --pytest_markers '${{ inputs.pytest-markers }}' \
               --pytest_command '${{ inputs.pytest-command }}' \
-              --timeout ${{ inputs.mcloud-timeout }} ${REF_ARGS}
+              --timeout ${{ inputs.mcloud-timeout }} ${REF_ARGS} \
+              --deps_group ${{ inputs.deps-group }}
diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py
@@ -5,6 +5,7 @@
 
 import numpy as np
 import torch
+from composer.utils import using_torch_2
 from omegaconf import DictConfig
 from transformers import PreTrainedTokenizerBase
 
@@ -347,7 +348,7 @@ def profile_packing(
     dataloader_cfg.dataset.packing_ratio = None
     dataloader_cfg.drop_last = False
     dataloader_cfg.num_workers = 0
-    dataloader_cfg.prefetch_factor = None
+    dataloader_cfg.prefetch_factor = None if using_torch_2() else 2
     dataloader_cfg.persistent_workers = False
 
     # Determine the packing_ratio values we'll try

diff --git a/llmfoundry/models/mpt/configuration_mpt.py b/llmfoundry/models/mpt/configuration_mpt.py
@@ -109,7 +109,7 @@ def __init__(
             init_device (str): The device to use for parameter initialization.
             logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value.
             no_bias (bool): Whether to use bias in all layers.
-            verbose (int): The verbosity level. 0 is silent.
+            verbose (int): Deprecated.
             embedding_fraction (float): The fraction to scale the gradients of the embedding layer by.
             norm_type (str): choose type of norm to use
             use_cache (bool): Whether or not the model should return the last key/values attentions

diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py
@@ -319,11 +319,12 @@ def _validate_cfg(icl_cfg: DictConfig):
                 prompt_string=icl_cfg.prompt_string,
                 example_delimiter=icl_cfg.example_delimiter,
                 continuation_delimiter=icl_cfg.continuation_delimiter,
+                question_prelimiter=icl_cfg.get('question_prelimiter', ''),
                 destination_path=destination_path,
                 pass_at_k=icl_cfg.pass_at_k,
                 generations_per_sample=icl_cfg.num_beams,
                 has_categories=icl_cfg.get('has_categories', False),
-            )
+                cot_delimiter=icl_cfg.get('cot_delimiter', ''))
             if hasattr(
                     icl_cfg,
                     'has_categories') and icl_cfg.has_categories and isinstance(

diff --git a/mcli/mcli-1b-max-seq-len-8k.yaml b/mcli/mcli-1b-max-seq-len-8k.yaml
@@ -123,7 +123,6 @@ parameters:
     activation_checkpointing_reentrant: false
     activation_cpu_offload: false
     limit_all_gathers: true
-    verbose: false
 
   # Logging
   progress_bar: false

diff --git a/mcli/mcli-llama2-finetune.yaml b/mcli/mcli-llama2-finetune.yaml
@@ -127,7 +127,6 @@ parameters:
     activation_checkpointing_reentrant: false
     activation_cpu_offload: false
     limit_all_gathers: true
-    verbose: false
 
   # Logging
   progress_bar: false

diff --git a/scripts/data_prep/convert_dataset_hf.py b/scripts/data_prep/convert_dataset_hf.py
@@ -186,6 +186,11 @@ def __init__(self,
                                                       folder_split='val_xsmall',
                                                       raw_samples=3000,
                                                       truncated_samples=3000)
+c4constants.splits['val_xxsmall'] = DataSplitConstants(
+    hf_split='validation',
+    folder_split='val_xxsmall',
+    raw_samples=100,
+    truncated_samples=100)
 
 CONSTS = {'c4': c4constants, 'the_pile': pileconstants}