fix double eos token for chatml (#1054) [skip ci]

* fix double eos token for chatml
* isolate fix to chatml conversation
* fix add special tokens to include rstrip
* add test for train_on_inputs for sharegpt
* don't use rstrip for chatml
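For context on the first bullet: the chatml template already closes each turn with <|im_end|>, and this tokenizer setup also uses <|im_end|> as the eos token, so a tokenization step that unconditionally appends eos_token_id would emit the token twice. A minimal sketch of the guard, with hypothetical names (not the literal axolotl change):

# Hypothetical sketch of the double-eos guard, not the actual axolotl code:
# only append the eos token when the rendered turn does not already end with it.
def append_eos_once(input_ids: list, eos_token_id: int) -> list:
    if not input_ids or input_ids[-1] != eos_token_id:
        input_ids.append(eos_token_id)
    return input_ids

# The bug this guards against: "hello<|im_end|>" already tokenizes to
# [..., 32000], so blindly appending eos would yield [..., 32000, 32000].
assert append_eos_once([21558, 32000], 32000) == [21558, 32000]
assert append_eos_once([21558], 32000) == [21558, 32000]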
Showing 2 changed files with 158 additions and 1 deletion.
@@ -0,0 +1,153 @@
"""
Test module for sharegpt integration with chatml
"""
import pytest
from datasets import Dataset
from tokenizers import AddedToken
from transformers import AutoTokenizer

from axolotl.datasets import TokenizedPromptDataset
from axolotl.prompt_strategies.sharegpt import SimpleShareGPTPromptTokenizingStrategy
from axolotl.prompters import ShareGPTPrompterV2


@pytest.fixture(name="sharegpt_dataset")
def fixture_sharegpt_dataset():
    return Dataset.from_list(
        [
            {
                "conversations": [
                    {
                        "from": "system",
                        "value": "repeat",
                    },
                    {
                        "from": "human",
                        "value": "hello",
                    },
                    {
                        "from": "gpt",
                        "value": "hello",
                    },
                    {
                        "from": "human",
                        "value": "goodbye",
                    },
                    {
                        "from": "gpt",
                        "value": "goodbye",
                    },
                ]
            }
        ]
    )


@pytest.fixture(name="tokenizer")
def fixture_tokenizer():
    tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
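    # rstrip=False is deliberate (see the "don't use rstrip for chatml" bullet
    # in the commit message): with rstrip=True the added token would absorb
    # trailing whitespace, so the " \n" after <|im_end|> (ids 28705, 13 in the
    # assertions below) would disappear from the tokenized output.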
    tokenizer.add_special_tokens(
        {
            "eos_token": AddedToken(
                "<|im_end|>", rstrip=False, lstrip=False, normalized=False
            )
        }
    )
    tokenizer.add_tokens(
        [
            AddedToken("<|im_start|>", rstrip=False, lstrip=False, normalized=False),
        ]
    )

    return tokenizer


class TestSharegpt:
    """
    Test class for sharegpt prompter
    """

    def test_no_double_im_end(self, sharegpt_dataset, tokenizer):
        strategy = SimpleShareGPTPromptTokenizingStrategy(
            ShareGPTPrompterV2(
                conversation="chatml",
                role_key_model=None,
                role_key_human=None,
            ),
            tokenizer,
            False,  # train_on_inputs
            2048,  # sequence_len
        )

        dataset_wrapper = TokenizedPromptDataset(
            strategy, sharegpt_dataset, process_count=1
        )

        input_ids = dataset_wrapper[0]["input_ids"]
        # fmt: off
        assert input_ids == [
            # 28705, 13 is " \n"
            1,  # bos
            32001, 1587, 13, 25997, 32000, 28705, 13,  # system
            32001, 2188, 13, 21558, 32000, 28705, 13,  # human
            32001, 13892, 13, 21558, 32000, 28705, 13,  # gpt
            32001, 2188, 13, 12684, 17664, 32000, 28705, 13,  # human
            32001, 13892, 13, 12684, 17664, 32000, 28705, 13,  # gpt
        ]
        # fmt: on

    def test_w_train_on_input(self, sharegpt_dataset, tokenizer):
        strategy = SimpleShareGPTPromptTokenizingStrategy(
            ShareGPTPrompterV2(
                conversation="chatml",
                role_key_model=None,
                role_key_human=None,
            ),
            tokenizer,
            True,  # train_on_inputs
            2048,  # sequence_len
        )

        dataset_wrapper = TokenizedPromptDataset(
            strategy, sharegpt_dataset, process_count=1
        )

        labels = dataset_wrapper[0]["labels"]
        # fmt: off
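        # -100 is the default ignore_index of PyTorch's CrossEntropyLoss, so
        # only the unmasked assistant tokens contribute to the training loss.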
        assert labels == [
            -100,  # bos
            -100, -100, -100, -100, -100, -100, -100,  # system
            -100, -100, -100, -100, -100, -100, -100,  # human
            -100, -100, 13, 21558, 32000, 28705, 13,  # gpt
            -100, -100, -100, -100, -100, -100, -100, -100,  # human
            -100, -100, 13, 12684, 17664, 32000, 28705, 13,  # gpt
        ]
        # fmt: on

    # def test_no_train_on_input(self, sharegpt_dataset, tokenizer):
    #     strategy = SimpleShareGPTPromptTokenizingStrategy(
    #         ShareGPTPrompterV2(
    #             conversation="chatml",
    #             role_key_model=None,
    #             role_key_human=None,
    #         ),
    #         tokenizer,
    #         False,  # train_on_inputs
    #         2048,  # sequence_len
    #     )
    #
    #     dataset_wrapper = TokenizedPromptDataset(
    #         strategy, sharegpt_dataset, process_count=1
    #     )
    #
    #     labels = dataset_wrapper[0]["labels"]
    #     # fmt: off
    #     assert labels == [
    #         1,  # bos
    #         32001, 1587, 13, 25997, 32000, 28705, 13,  # system
    #         32001, 2188, 13, 21558, 32000, 28705, 13,  # human
    #         32001, 13892, 13, 21558, 32000, 28705, 13,  # gpt
    #         32001, 2188, 13, 12684, 17664, 32000, 28705, 13,  # human
    #         32001, 13892, 13, 12684, 17664, 32000, 28705, 13,  # gpt
    #     ]
    #     # fmt: on
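Not part of the commit, but a quick way to sanity-check the asserted ids is to map them back to token strings with the same tokenizer setup as the fixture; a sketch, assuming the Mistral-7B-v0.1 tokenizer is available locally:

# Hypothetical sanity check, not in the commit: map the asserted ids back to
# token strings to confirm each turn ends with exactly one <|im_end|>.
from tokenizers import AddedToken
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
tokenizer.add_special_tokens(
    {"eos_token": AddedToken("<|im_end|>", rstrip=False, lstrip=False, normalized=False)}
)
tokenizer.add_tokens(
    [AddedToken("<|im_start|>", rstrip=False, lstrip=False, normalized=False)]
)

# bos + the system turn from test_no_double_im_end
print(tokenizer.convert_ids_to_tokens([1, 32001, 1587, 13, 25997, 32000, 28705, 13]))
# expected along the lines of:
# ['<s>', '<|im_start|>', '▁system', '<0x0A>', '▁repeat', '<|im_end|>', '▁', '<0x0A>']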