Skip to content

Commit

Permalink
Add error to catch more unknown example types (#1562)
Browse files: browse the repository at this point in the history
  • Loading branch information
milocress authored Oct 1, 2024
1 parent ec4cafd commit b517297
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 0 deletions.
2 changes: 2 additions & 0 deletions llmfoundry/data/finetuning/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,8 @@ def _get_key(dictionary: Mapping[str, Any], allowed_keys: set[str]):
if not isinstance(dictionary, Mapping):
raise InvalidExampleTypeError(str(type(dictionary)))
desired_keys = allowed_keys.intersection(dictionary.keys())
if len(desired_keys) == 0:
raise UnknownExampleTypeError(str(set(dictionary.keys())))
return list(desired_keys)[0]


Expand Down
10 changes: 10 additions & 0 deletions tests/data/test_template_tokenization.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,21 @@ def test_tokenize_chat_example_malformed():
}
wrong_example_type = ['this is not a dictionary']
wrong_messages_type = {'messages': 'this is not a list of messages'}
wrong_role = {
'messages': [{
'role': 'user',
'content': 'Hello GPT!',
}, {
'role': 'misnamed_assistant',
'content': 'user message not followed by an assistant label',
}],
}
malformed_chat_examples = [
too_few_messages,
no_content,
ends_with_user_role,
no_assistant_message,
wrong_role,
]
my_tokenizer = build_tokenizer('mosaicml/mpt-7b-8k-chat', {})
for example in malformed_chat_examples:
Expand Down

0 comments on commit b517297

Please sign in to comment.