Commit

Merge branch 'main' into dbfs-hf
dakinggg authored Jun 5, 2024
2 parents 3adc789 + ac56dc5 commit e5a0719
Showing 27 changed files with 1,724 additions and 272 deletions.
4 changes: 4 additions & 0 deletions .github/CODEOWNERS
@@ -2,6 +2,10 @@
# This includes setup.py, the README, and the CODEOWNERS file itself!
/* @mosaicml/composer-team-admins

+ # Require team approval for code changes
+ /llmfoundry/ @mosaicml/composer-team-eng
+ /scripts/ @mosaicml/composer-team-eng
+
# Require admin approval to change the CI build configuration
# All CI Changes should be reviewed for security
/.ci/ @mosaicml/composer-team-admins
2 changes: 1 addition & 1 deletion Dockerfile
@@ -13,7 +13,7 @@ ADD https://raw.githubusercontent.com/mosaicml/llm-foundry/$BRANCH_NAME/setup.py
RUN rm setup.py

# Install TransformerEngine
- RUN NVTE_FRAMEWORK=pytorch CMAKE_BUILD_PARALLEL_LEVEL=4 MAX_JOBS=4 pip install git+https://github.com/NVIDIA/TransformerEngine.git@05eb6deb31c1b48e9f4380d18fe95f3c38e84335
+ RUN NVTE_FRAMEWORK=pytorch CMAKE_BUILD_PARALLEL_LEVEL=3 MAX_JOBS=3 pip install git+https://github.com/cli99/TransformerEngine.git@6b21f606f2459d49c2113d69236d68d334edeb4c

# Install and uninstall foundry to cache foundry requirements
RUN git clone -b $BRANCH_NAME https://github.com/mosaicml/llm-foundry.git
6 changes: 3 additions & 3 deletions README.md
@@ -169,11 +169,11 @@ pip install -e ".[gpu]" # or `pip install -e .` if no NVIDIA GPU.
```

### TransformerEngine and amp_fp8 support
- NVIDIA H100 GPUs have FP8 support; this additionally requires the following installations:
+ NVIDIA H100 GPUs have FP8 support; we have already installed Flash Attention and TransformerEngine in our Docker images (see above). If you are not using our Docker images, you can install these packages with:
<!--pytest.mark.skip-->
```bash
- pip install flash-attn==1.0.7 --no-build-isolation
- pip install git+https://github.com/NVIDIA/TransformerEngine.git@v0.10
+ pip install flash-attn --no-build-isolation
+ pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable
```

See [here](https://github.com/mosaicml/llm-foundry/blob/main/TUTORIAL.md#TransformerEngine-and-amp_fp8-support) for more details on enabling TransformerEngine layers and amp_fp8.
5 changes: 5 additions & 0 deletions llmfoundry/callbacks/__init__.py
@@ -22,6 +22,8 @@
from llmfoundry.callbacks.log_mbmoe_tok_per_expert_callback import (
    MegaBlocksMoE_TokPerExpert,
)
+ from llmfoundry.callbacks.loss_perp_v_len_callback import \
+     LossPerpVsContextLengthLogger
from llmfoundry.callbacks.monolithic_ckpt_callback import (
    MonolithicCheckpointSaver,
)
@@ -52,6 +54,8 @@
callbacks.register('mbmoe_tok_per_expert', func=MegaBlocksMoE_TokPerExpert)
callbacks.register('run_timeout', func=RunTimeoutCallback)

+ callbacks.register('loss_perp_v_len', func=LossPerpVsContextLengthLogger)
+
callbacks_with_config.register('async_eval', func=AsyncEval)
callbacks_with_config.register('curriculum_learning', func=CurriculumLearning)

@@ -66,4 +70,5 @@
    'MegaBlocksMoE_TokPerExpert',
    'AsyncEval',
    'CurriculumLearning',
+   'LossPerpVsContextLengthLogger',
]
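
As context for the registration above, here is a minimal sketch of how the newly registered name can be looked back up through llm-foundry's callbacks registry. It assumes `llmfoundry.registry.callbacks` follows the catalogue-style registry interface (with a `get()` method that returns the registered class); it is an illustration, not part of this commit.

```python
# Minimal sketch (assumption: llmfoundry.registry.callbacks is a
# catalogue-style registry whose get() returns the registered class).
from llmfoundry.registry import callbacks

# Resolve the class registered above under the name 'loss_perp_v_len'.
callback_cls = callbacks.get('loss_perp_v_len')
print(callback_cls.__name__)  # expected: LossPerpVsContextLengthLogger
```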