
Commit

fixes
milocress committed Jan 18, 2024
1 parent 2148405 commit b660dbc
Showing 2 changed files with 10 additions and 16 deletions.
1 change: 0 additions & 1 deletion llmfoundry/data/finetuning/tasks.py
@@ -163,7 +163,6 @@ def _tokenize_formatted_example(
         example: Conversation,
         tokenizer: PreTrainedTokenizerBase) -> TokenizedConversation:
     example_format = _get_conversation_type(example)
-    print(f'{example_format=}')
 
     if example_format == 'chat':
         chat_example: ChatFormattedDict = example  # type: ignore
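
The deleted line was a leftover debugging aid: the f-string `=` specifier (Python 3.8+) prints both the expression text and its value. A minimal illustration of what it emitted:

    example_format = 'chat'
    print(f'{example_format=}')  # prints: example_format='chat'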
25 changes: 10 additions & 15 deletions tests/data/test_dataloader.py
@@ -9,6 +9,7 @@
 from argparse import Namespace
 from contextlib import nullcontext as does_not_raise
 from pathlib import Path
+from typing import ContextManager, Literal, Optional, Union
 from unittest.mock import MagicMock
 
 import pytest
@@ -22,21 +23,18 @@
 from llmfoundry import (build_finetuning_dataloader,
                         build_text_denoising_dataloader)
 from llmfoundry.data import build_dataloader
+from llmfoundry.data.finetuning.tasks import (_ALLOWED_PROMPT_KEYS,
+                                              _ALLOWED_RESPONSE_KEYS,
+                                              DOWNLOADED_FT_DATASETS_DIRPATH,
+                                              SUPPORTED_EXTENSIONS,
+                                              _tokenize_formatted_example)
 from llmfoundry.data.text_data import (ConcatenatedSequenceCollatorWrapper,
                                        build_text_dataloader,
                                        get_tokens_per_batch_func)
 from llmfoundry.utils.builders import build_tokenizer
 from scripts.data_prep.convert_dataset_hf import main as main_hf
 from tests.data_utils import make_tiny_ft_dataset
-
-from llmfoundry.data.finetuning.tasks import ( # isort:skip
-    _ALLOWED_PROMPT_KEYS, _ALLOWED_RESPONSE_KEYS, # isort:skip
-    DOWNLOADED_FT_DATASETS_DIRPATH, SUPPORTED_EXTENSIONS,
-    ChatFormattedDict, # isort:skip
-    PromptResponseDict, _tokenize_formatted_example) # isort:skip
-
-from typing import ContextManager, List, Literal, Optional, Union # isort:skip
 
 
 def get_config(conf_path: str = 'yamls/mpt/125m.yaml'):
     os.environ['TOKENIZERS_PARALLELISM'] = 'false'
@@ -431,7 +429,7 @@ def test_tokenize_example_malformed():
         'completion': 'completion'
     }
     no_content = {'messages': [{'role': 'user'}]}
-    ends_with_user_role: ChatFormattedDict = {
+    ends_with_user_role = {
         'messages': [{
             'role': 'user',
             'content': 'Hello GPT!'
@@ -443,7 +441,7 @@
             'content': 'user message not followed by an assistant label'
         }]
     }
-    no_assistant_message: ChatFormattedDict = {
+    no_assistant_message = {
         'messages': [{
             'role': 'user',
             'content': 'Hello GPT!'
@@ -479,15 +477,12 @@ def test_tokenize_example_well_formed():
     for prompt_key in _ALLOWED_PROMPT_KEYS:
         for response_key in _ALLOWED_RESPONSE_KEYS:
 
-            example: PromptResponseDict = {
-                prompt_key: 'prompt',
-                response_key: 'response'
-            }
+            example = {prompt_key: 'prompt', response_key: 'response'}
             tokenized_example = _tokenize_formatted_example(example, tokenizer)
             assert 'input_ids' in tokenized_example
             assert 'labels' in tokenized_example
 
-    chat_examples: List[ChatFormattedDict] = [
+    chat_examples = [
         {
             'messages': [{
                 'role': 'user',
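
For context on what these tests exercise: `_tokenize_formatted_example` accepts either a prompt/response dict or a chat-formatted dict with a 'messages' list. The sketch below is pieced together from the assertions in the diff above; the choice of the 'gpt2' tokenizer is an assumption for illustration, not necessarily what the test suite uses:

    from transformers import AutoTokenizer

    from llmfoundry.data.finetuning.tasks import _tokenize_formatted_example

    # Arbitrary HuggingFace tokenizer, assumed here for illustration.
    tokenizer = AutoTokenizer.from_pretrained('gpt2')

    # Prompt/response format: exactly one allowed prompt key and one
    # allowed response key; the result carries input_ids and labels.
    pr_example = {'prompt': 'Hello GPT!', 'response': 'Hi, human.'}
    tokenized = _tokenize_formatted_example(pr_example, tokenizer)
    assert 'input_ids' in tokenized and 'labels' in tokenized

    # Chat format: a 'messages' list that must end on an assistant turn
    # (the malformed-input tests above reject anything else). Tokenizing
    # this path relies on the tokenizer's chat template, so a
    # chat-capable tokenizer is required for it.
    chat_example = {
        'messages': [
            {'role': 'user', 'content': 'Hello GPT!'},
            {'role': 'assistant', 'content': 'Hi, human.'},
        ]
    }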
