diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml
index 27516b4ce2..e62b01fa52 100644
--- a/.github/workflows/docker.yaml
+++ b/.github/workflows/docker.yaml
@@ -88,5 +88,6 @@ jobs:
           cache-from: type=registry,ref=${{ env.IMAGE_CACHE }}
           cache-to: type=registry,ref=${{ env.IMAGE_CACHE }},mode=max
           build-args: |
+            BRANCH_NAME=${{ github.head_ref || github.ref_name }}
             BASE_IMAGE=${{ matrix.base_image }}
             DEP_GROUPS=${{ matrix.dep_groups }}
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 2c85e7801e..62bc853fb5 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -62,7 +62,7 @@ repos:
   - id: insert-license
     args:
     - --license-filepath
-    - .ci/FILE_HEADER
+    - .pre-commit/FILE_HEADER
     - --comment-style
     - "#"
     - --allow-past-years
diff --git a/.ci/FILE_HEADER b/.pre-commit/FILE_HEADER
similarity index 100%
rename from .ci/FILE_HEADER
rename to .pre-commit/FILE_HEADER
diff --git a/Dockerfile b/Dockerfile
index 6c283660c4..fd5b8e40ab 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -4,10 +4,16 @@
 ARG BASE_IMAGE
 FROM $BASE_IMAGE
 
+ARG BRANCH_NAME
 ARG DEP_GROUPS
 
+# Check for changes in setup.py.
+# If there are changes, the docker cache is invalidated and a fresh pip installation is triggered.
+ADD https://raw.githubusercontent.com/mosaicml/llm-foundry/$BRANCH_NAME/setup.py setup.py
+RUN rm setup.py
+
 # Install and uninstall foundry to cache foundry requirements
-RUN git clone -b main https://github.com/mosaicml/llm-foundry.git
+RUN git clone -b $BRANCH_NAME https://github.com/mosaicml/llm-foundry.git
 RUN pip install --no-cache-dir "./llm-foundry${DEP_GROUPS}"
 RUN pip uninstall -y llm-foundry
 RUN rm -rf llm-foundry
diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py
index 0d8264f570..e541d8209e 100644
--- a/llmfoundry/utils/builders.py
+++ b/llmfoundry/utils/builders.py
@@ -409,6 +409,10 @@ def build_tokenizer(
             int(1e30),
         )
 
+    if not hasattr(tokenizer, 'eos_token') or tokenizer.eos_token is None:
+        raise ValueError(
+            f'The tokenizer {tokenizer_name} must have an eos_token.')
+
     if dist.is_available() and dist.is_initialized(
     ) and dist.get_world_size() > 1:
         if dist.get_local_rank() == 0:
diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py
index d4ba39acfa..fb4b75ec31 100644
--- a/scripts/eval/eval.py
+++ b/scripts/eval/eval.py
@@ -18,9 +18,11 @@
 from composer.utils import dist, get_device, reproducibility
 from omegaconf import DictConfig, ListConfig
 from omegaconf import OmegaConf as om
+from rich.traceback import install
 from transformers import (AutoModelForCausalLM, PreTrainedTokenizerBase,
                           T5ForConditionalGeneration)
 
+install()
 from llmfoundry.models import MPTForCausalLM
 from llmfoundry.models.model_registry import COMPOSER_MODEL_REGISTRY
 from llmfoundry.utils.builders import (add_metrics_to_eval_loaders,
diff --git a/scripts/train/train.py b/scripts/train/train.py
index 7bb5e71394..47385424e8 100644
--- a/scripts/train/train.py
+++ b/scripts/train/train.py
@@ -20,8 +20,10 @@
 from composer.utils import dist, get_device, reproducibility
 from omegaconf import DictConfig, ListConfig
 from omegaconf import OmegaConf as om
+from rich.traceback import install
 from transformers import PreTrainedTokenizerBase
 
+install()
 from llmfoundry import (COMPOSER_MODEL_REGISTRY, ComposerHFCausalLM,
                         MPTForCausalLM)
 from llmfoundry.callbacks import AsyncEval
diff --git a/tests/utils/test_builders.py b/tests/utils/test_builders.py
index 303afc9b7d..b35e053c5d 100644
--- a/tests/utils/test_builders.py
+++ b/tests/utils/test_builders.py
@@ -48,6 +48,13 @@ def test_tokenizer_builder(tokenizer_name: str, tokenizer_kwargs: dict):
     assert isinstance(tokenizer, PreTrainedTokenizerBase)
 
 
+def test_tokenizer_no_EOS():
+    with pytest.raises(
+            ValueError,
+            match='The tokenizer bert-base-uncased must have an eos_token.'):
+        build_tokenizer('bert-base-uncased', {})
+
+
 def test_build_callback_fails():
     with pytest.raises(ValueError):
         build_callback('nonexistent_callback', {}, {})
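
Note on the build_tokenizer change: the new guard rejects any tokenizer that does
not define an eos_token, which bert-base-uncased (used in the test above) does not.
A minimal standalone sketch of the same check, assuming only that transformers is
installed; the helper name check_eos_token is hypothetical, since in the patch the
check lives inline in build_tokenizer:

from transformers import AutoTokenizer, PreTrainedTokenizerBase


def check_eos_token(tokenizer: PreTrainedTokenizerBase,
                    tokenizer_name: str) -> None:
    # Mirrors the guard added in llmfoundry/utils/builders.py: downstream
    # training and eval code relies on an eos_token being present.
    if not hasattr(tokenizer, 'eos_token') or tokenizer.eos_token is None:
        raise ValueError(
            f'The tokenizer {tokenizer_name} must have an eos_token.')


if __name__ == '__main__':
    tok = AutoTokenizer.from_pretrained('bert-base-uncased')
    try:
        check_eos_token(tok, 'bert-base-uncased')
    except ValueError as err:
        # BERT uses [CLS]/[SEP] and sets no eos_token, so this raises.
        print(err)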
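
Note on the rich.traceback.install() calls in train.py and eval.py: install() is
invoked after the third-party imports but before the llmfoundry imports, so any
exception raised while importing llmfoundry code is already rendered by rich. A
minimal sketch of the same pattern, assuming the rich package is installed:

from rich.traceback import install

install()  # replace the default excepthook before importing anything fragile


def main() -> None:
    # Any uncaught exception from here on is printed with syntax highlighting
    # and surrounding source context instead of the plain Python traceback.
    raise RuntimeError('demo failure rendered by rich')


if __name__ == '__main__':
    main()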