Skip to content

Commit

Permalink
fix
Browse files Browse the repository at this point in the history
allow case-insensitive example keys

tmp

fix

fix

revert

fix

fix formatting

fix

format

format
  • Loading branch information
Ubuntu committed Jun 1, 2024
1 parent fb9a225 commit d1561d3
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 18 deletions.
36 changes: 23 additions & 13 deletions llmfoundry/data/finetuning/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:
import logging
import os
import warnings
from collections.abc import Mapping
from collections.abc import KeysView, Mapping
from functools import partial
from pathlib import Path
from typing import (
Expand Down Expand Up @@ -71,6 +71,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:
ALLOWED_RESPONSE_KEYS,
ChatTemplateError,
ConsecutiveRepeatedChatRolesError,
ExampleDatasetKeyCaseError,
IncorrectMessageKeyQuantityError,
InvalidContentTypeError,
InvalidFileExtensionError,
Expand Down Expand Up @@ -134,22 +135,31 @@ def _get_example_type(example: Example) -> ExampleType:
raise TypeError(
f'Expected example to be a Mapping, but found {type(example)}',
)
if (
len(example.keys()) == 1 and any(

def match_keys(keys: KeysView) -> ExampleType:
if len(keys) == 1 and any(
allowed_message_key in example
for allowed_message_key in ALLOWED_MESSAGES_KEYS
)
):
return 'chat'
elif (
len(example.keys()) == 2 and
any(p in example for p in ALLOWED_PROMPT_KEYS) and
any(r in example for r in ALLOWED_RESPONSE_KEYS)
):
return 'prompt_response'
else:
):
return 'chat'
elif (
len(example.keys()) == 2 and
any(p in example for p in ALLOWED_PROMPT_KEYS) and
any(r in example for r in ALLOWED_RESPONSE_KEYS)
):
return 'prompt_response'
raise UnknownExampleTypeError(str(example.keys()))

try:
example_type = match_keys(example.keys())
except UnknownExampleTypeError:
# We try to match the keys in lower case again.
example_lower = {key.lower(): value for key, value in example.items()}
match_keys(example_lower.keys())
# If there is a match then we let the user know that the keys are case sensitive.
raise ExampleDatasetKeyCaseError(str(example.keys()))
return example_type


def _is_empty_or_nonexistent(dirpath: str) -> bool:
"""Check if a directory is empty or non-existent.
Expand Down
24 changes: 19 additions & 5 deletions llmfoundry/utils/exceptions.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# Copyright 2024 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0

"""Custom exceptions for the LLMFoundry."""
from typing import Any, Dict, List, Literal, Optional, Union

Expand Down Expand Up @@ -61,10 +60,10 @@ def __init__(self, message: str, **kwargs: Any) -> None:
def __reduce__(self):
"""Adjust the reduce behavior for pickling.
Because we have custom exception subclasses with constructor args, we
need to adjust the reduce behavior to ensure that the exception can be
pickled. This allows error propagation across processes in
multiprocessing.
Because we have custom exception subclasses with constructor
args, we need to adjust the reduce behavior to ensure that the
exception can be pickled. This allows error propagation across
processes in multiprocessing.
"""
if self.__class__ == BaseContextualError:
raise NotImplementedError(
Expand Down Expand Up @@ -173,6 +172,21 @@ def __init__(self, example_keys: str) -> None:
super().__init__(message, example_keys=example_keys)


class ExampleDatasetKeyCaseError(UserError):
    """Error thrown when keys in a dataset example are not in lowercase.

    Signals that the keys of an example could potentially match one of the
    expected example types if their casing were corrected.

    Args:
        example_keys (str): The example's keys rendered as a string. The value
            is interpolated into the error message and also forwarded to the
            base class as the ``example_keys`` keyword argument.
    """

    def __init__(self, example_keys: str) -> None:
        # The message is handed directly to the base class; the keyword
        # argument mirrors this constructor's own parameter name.
        super().__init__(
            f"Found keys {example_keys} in the dataset. All keys in datasets must be in lowercase. "
            f"Please ensure all keys are formatted correctly.",
            example_keys=example_keys,
        )


class NotEnoughChatDataError(UserError):
"""Error thrown when there is not enough chat data to train a model."""

Expand Down

0 comments on commit d1561d3

Please sign in to comment.