Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
allow case insensitive example keys
Browse files Browse the repository at this point in the history
tmp

fix

fix

revert
Ubuntu committed May 31, 2024
1 parent 69d9d29 commit 051fcab
Showing 3 changed files with 31 additions and 13 deletions.
2 changes: 1 addition & 1 deletion llmfoundry/data/finetuning/dataloader.py
Original file line number Diff line number Diff line change
@@ -542,7 +542,7 @@ def _download_remote_hf_dataset(remote_path: str, split: str) -> str:
except FileNotFoundError as e:
if extension == SUPPORTED_EXTENSIONS[-1]:
files_searched = [
f'{name[:-len(extension)]}{ext}' for ext in SUPPORTED_EXTENSIONS
f'{name}/{split}{ext}' for ext in SUPPORTED_EXTENSIONS
]
raise FileNotFoundError(
f'Could not find a file with any of ' + \
33 changes: 21 additions & 12 deletions llmfoundry/data/finetuning/tasks.py
Original file line number Diff line number Diff line change
@@ -35,7 +35,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:
import logging
import os
import warnings
from collections.abc import Mapping
from collections.abc import KeysView, Mapping
from functools import partial
from pathlib import Path
from typing import (
@@ -71,6 +71,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:
ALLOWED_RESPONSE_KEYS,
ChatTemplateError,
ConsecutiveRepeatedChatRolesError,
ExampleDatasetKeyCaseError,
IncorrectMessageKeyQuantityError,
InvalidContentTypeError,
InvalidFileExtensionError,
@@ -134,20 +135,28 @@ def _get_example_type(example: Example) -> ExampleType:
raise TypeError(
f'Expected example to be a Mapping, but found {type(example)}',
)
if (
len(example.keys()) == 1 and any(

def match_keys(keys: KeysView) -> str:
    """Classify a collection of example keys as a known example type.

    Every check is performed against ``keys`` itself rather than the
    enclosing ``example``, so a caller can pass a normalized copy of the
    keys (for instance, lowercased) and get a classification for that copy.

    Args:
        keys: The keys to classify. Any sized container supporting ``in``
            works (a dict keys view or a plain list of strings).

    Returns:
        ``'chat'`` when there is exactly one key and it is an allowed
        messages key, ``'prompt_response'`` when there are exactly two keys
        covering an allowed prompt key and an allowed response key, and
        ``'unknown'`` otherwise.
    """
    if len(keys) == 1 and any(
        allowed_message_key in keys
        for allowed_message_key in ALLOWED_MESSAGES_KEYS
    ):
        return 'chat'
    elif (
        len(keys) == 2 and
        any(p in keys for p in ALLOWED_PROMPT_KEYS) and
        any(r in keys for r in ALLOWED_RESPONSE_KEYS)
    ):
        return 'prompt_response'
    return 'unknown'

example_type = match_keys(example.keys())
if example_type != 'unknown':
return example_type
else:
# We try to match the keys in lowercase to give a more informative error message.
if match_keys([key.lower() for key in example.keys()]) != 'unknown':
raise ExampleDatasetKeyCaseError(str(example.keys()))
raise UnknownExampleTypeError(str(example.keys()))


9 changes: 9 additions & 0 deletions llmfoundry/utils/exceptions.py
Original file line number Diff line number Diff line change
@@ -172,6 +172,15 @@ def __init__(self, example_keys: str) -> None:

super().__init__(message, example_keys=example_keys)

class ExampleDatasetKeyCaseError(UserError):
    """Raised when the keys of a dataset example are not all lowercase.

    Signals that the keys could potentially match one of the expected
    example types once their casing is corrected.
    """

    def __init__(self, example_keys: str) -> None:
        # Assemble the message from its three sentences; joining with a
        # single space reproduces the exact text shown to the user.
        found = f'Found keys {example_keys} in the dataset.'
        requirement = 'All keys in datasets must be in lowercase.'
        advice = 'Please ensure all keys are formatted correctly.'
        super().__init__(
            ' '.join((found, requirement, advice)),
            example_keys=example_keys,
        )

class NotEnoughChatDataError(UserError):
"""Error thrown when there is not enough chat data to train a model."""

0 comments on commit 051fcab

Please sign in to comment.