Skip to content

Commit

Permalink
fix
Browse files Browse the repository at this point in the history
allow case-insensitive example keys

tmp

fix

fix

revert

fix

fix formatting

fix
  • Loading branch information
Ubuntu committed May 31, 2024
1 parent fb9a225 commit 42fcd8d
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 13 deletions.
36 changes: 23 additions & 13 deletions llmfoundry/data/finetuning/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:
import logging
import os
import warnings
from collections.abc import Mapping
from collections.abc import KeysView, Mapping
from functools import partial
from pathlib import Path
from typing import (
Expand Down Expand Up @@ -71,6 +71,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:
ALLOWED_RESPONSE_KEYS,
ChatTemplateError,
ConsecutiveRepeatedChatRolesError,
ExampleDatasetKeyCaseError,
IncorrectMessageKeyQuantityError,
InvalidContentTypeError,
InvalidFileExtensionError,
Expand Down Expand Up @@ -134,22 +135,31 @@ def _get_example_type(example: Example) -> ExampleType:
raise TypeError(
f'Expected example to be a Mapping, but found {type(example)}',
)
if (
len(example.keys()) == 1 and any(

def match_keys(keys: KeysView) -> ExampleType:
if len(keys) == 1 and any(
allowed_message_key in example
for allowed_message_key in ALLOWED_MESSAGES_KEYS
)
):
return 'chat'
elif (
len(example.keys()) == 2 and
any(p in example for p in ALLOWED_PROMPT_KEYS) and
any(r in example for r in ALLOWED_RESPONSE_KEYS)
):
return 'prompt_response'
else:
):
return 'chat'
elif (
len(example.keys()) == 2 and
any(p in example for p in ALLOWED_PROMPT_KEYS) and
any(r in example for r in ALLOWED_RESPONSE_KEYS)
):
return 'prompt_response'
raise UnknownExampleTypeError(str(example.keys()))

try:
example_type = match_keys(example.keys())
except UnknownExampleTypeError:
# We try to match the keys in lower case again.
example_lower = {key.lower(): value for key, value in example.items()}
match_keys(example_lower.keys())
# If there is a match then we let the user know that the keys are case sensitive.
raise ExampleDatasetKeyCaseError(str(example.keys()))
return example_type


def _is_empty_or_nonexistent(dirpath: str) -> bool:
"""Check if a directory is empty or non-existent.
Expand Down
13 changes: 13 additions & 0 deletions llmfoundry/utils/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,19 @@ def __init__(self, example_keys: str) -> None:

super().__init__(message, example_keys=example_keys)

class ExampleDatasetKeyCaseError(UserError):
    """Error thrown when keys in a dataset example are not in lowercase.

    Meant for examples whose keys could match one of the expected example
    types if their casing were corrected. The string form of the offending
    keys is embedded in the error message and also forwarded to the base
    class as the ``example_keys`` keyword argument.
    """

    def __init__(self, example_keys: str) -> None:
        # Assemble the user-facing text from its three sentences; joining on
        # a single space yields the same message as adjacent-literal
        # concatenation would.
        details = ' '.join((
            f'Found keys {example_keys} in the dataset.',
            'All keys in datasets must be in lowercase.',
            'Please ensure all keys are formatted correctly.',
        ))
        super().__init__(details, example_keys=example_keys)

class NotEnoughChatDataError(UserError):
"""Error thrown when there is not enough chat data to train a model."""
Expand Down

0 comments on commit 42fcd8d

Please sign in to comment.