Skip to content

Commit

Permalink
Merge branch 'main' into composer_lora
Browse files (browse the repository at this point in the history)
  • Loading branch information
dakinggg committed Jan 22, 2024
2 parents 03a1c57 + b2a0c03 commit 7cb401b
Show file tree
Hide file tree
Showing 5 changed files with 18 additions and 4 deletions.
2 changes: 1 addition & 1 deletion .ci/FILE_HEADER
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
-Copyright 2022 MosaicML LLM Foundry authors
+Copyright 2024 MosaicML LLM Foundry authors
SPDX-License-Identifier: Apache-2.0
3 changes: 2 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -57,14 +57,15 @@ repos:
- id: mixed-line-ending
- id: trailing-whitespace
- repo: https://github.com/Lucas-C/pre-commit-hooks
-rev: v1.3.1
+rev: v1.5.4
hooks:
- id: insert-license
args:
- --license-filepath
- .ci/FILE_HEADER
- --comment-style
- '#'
+- --allow-past-years
types: [python]
- repo: https://github.com/PyCQA/docformatter
rev: v1.5.0
Expand Down
2 changes: 1 addition & 1 deletion llmfoundry/tokenizers/tiktoken.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ def default_chat_template(self):
'{% else %}'
"{{ '\n' + '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}"
'{% endif %}'
-'{% if (add_generation_prompt == true) %}'
+'{% if (add_generation_prompt == true and loop.last) %}'
"{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}"
"{% elif (message['role'] == 'assistant') %}"
'{{ eos_token }}'
Expand Down
5 changes: 4 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
+# Copyright 2024 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+
"""MosaicML LLM Foundry package setup."""

import os
Expand Down Expand Up @@ -52,7 +55,7 @@
'transformers>=4.36,<4.37',
'mosaicml-streaming>=0.7.2,<0.8',
'torch>=2.1,<2.1.1',
-'datasets==2.15.0',
+'datasets>=2.16,<2.17',
'fsspec==2023.6.0', # newer version results in a bug in datasets that duplicates data
'sentencepiece==0.1.97',
'einops==0.7.0',
Expand Down
10 changes: 10 additions & 0 deletions tests/tokenizers/test_tiktoken.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,12 @@
'Please summarize the goals in this text:\n\nGoing outside has benefits include reducing stress and triggering the relaxation response, which can help us not only feel better mentally, but even heal faster from physical ailments.',
'role':
'user'
+}, {
+'content': 'You should go outside and touch grass.',
+'role': 'assistant'
+}, {
+'content': 'What else can I do?',
+'role': 'user'
}]]

MULTI_TURN_GENERATE_STRING = [
Expand All @@ -118,6 +124,10 @@
Going outside has benefits include reducing stress and triggering the relaxation response, which can help us not only feel better mentally, but even heal faster from physical ailments.<|im_end|>
<|im_start|>assistant
+You should go outside and touch grass.<|im_end|><|endoftext|>
+<|im_start|>user
+What else can I do?<|im_end|>
+<|im_start|>assistant
"""
]

Expand Down

0 comments on commit 7cb401b

Please sign in to comment.