diff --git a/.ci/FILE_HEADER b/.ci/FILE_HEADER index 22198520fd..e6d99f5d6f 100644 --- a/.ci/FILE_HEADER +++ b/.ci/FILE_HEADER @@ -1,2 +1,2 @@ -Copyright 2022 MosaicML LLM Foundry authors +Copyright 2024 MosaicML LLM Foundry authors SPDX-License-Identifier: Apache-2.0 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d4c8cc699c..4a5bf1f6bb 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -57,7 +57,7 @@ repos: - id: mixed-line-ending - id: trailing-whitespace - repo: https://github.com/Lucas-C/pre-commit-hooks - rev: v1.3.1 + rev: v1.5.4 hooks: - id: insert-license args: @@ -65,6 +65,7 @@ repos: - .ci/FILE_HEADER - --comment-style - '#' + - --allow-past-years types: [python] - repo: https://github.com/PyCQA/docformatter rev: v1.5.0 diff --git a/llmfoundry/tokenizers/tiktoken.py b/llmfoundry/tokenizers/tiktoken.py index 2632985533..eaaf0da316 100644 --- a/llmfoundry/tokenizers/tiktoken.py +++ b/llmfoundry/tokenizers/tiktoken.py @@ -198,7 +198,7 @@ def default_chat_template(self): '{% else %}' "{{ '\n' + '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}" '{% endif %}' - '{% if (add_generation_prompt == true) %}' + '{% if (add_generation_prompt == true and loop.last) %}' "{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}" "{% elif (message['role'] == 'assistant') %}" '{{ eos_token }}' diff --git a/setup.py b/setup.py index 2c4a05f396..8c43a309c3 100644 --- a/setup.py +++ b/setup.py @@ -1,3 +1,6 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + """MosaicML LLM Foundry package setup.""" import os @@ -52,7 +55,7 @@ 'transformers>=4.36,<4.37', 'mosaicml-streaming>=0.7.2,<0.8', 'torch>=2.1,<2.1.1', - 'datasets==2.15.0', + 'datasets>=2.16,<2.17', 'fsspec==2023.6.0', # newer version results in a bug in datasets that duplicates data 'sentencepiece==0.1.97', 'einops==0.7.0', diff --git a/tests/tokenizers/test_tiktoken.py b/tests/tokenizers/test_tiktoken.py index 6a4d1c99c4..aca269af82 100644 --- a/tests/tokenizers/test_tiktoken.py +++ b/tests/tokenizers/test_tiktoken.py @@ -108,6 +108,12 @@ 'Please summarize the goals in this text:\n\nGoing outside has benefits include reducing stress and triggering the relaxation response, which can help us not only feel better mentally, but even heal faster from physical ailments.', 'role': 'user' +}, { + 'content': 'You should go outside and touch grass.', + 'role': 'assistant' +}, { + 'content': 'What else can I do?', + 'role': 'user' }]] MULTI_TURN_GENERATE_STRING = [ @@ -118,6 +124,10 @@ Going outside has benefits include reducing stress and triggering the relaxation response, which can help us not only feel better mentally, but even heal faster from physical ailments.<|im_end|> <|im_start|>assistant +You should go outside and touch grass.<|im_end|><|endoftext|> +<|im_start|>user +What else can I do?<|im_end|> +<|im_start|>assistant """ ]