fix

allow case insensitive exampple keys tmp fix fix revert fix fix formatting fix
mosaicml · May 31, 2024 · 42fcd8d · 42fcd8d
1 parent fb9a225
commit 42fcd8d
Show file tree

Hide file tree

Showing 2 changed files with 36 additions and 13 deletions.
diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py
@@ -35,7 +35,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:
 import logging
 import os
 import warnings
-from collections.abc import Mapping
+from collections.abc import KeysView, Mapping
 from functools import partial
 from pathlib import Path
 from typing import (
@@ -71,6 +71,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:
     ALLOWED_RESPONSE_KEYS,
     ChatTemplateError,
     ConsecutiveRepeatedChatRolesError,
+    ExampleDatasetKeyCaseError,
     IncorrectMessageKeyQuantityError,
     InvalidContentTypeError,
     InvalidFileExtensionError,
@@ -134,22 +135,31 @@ def _get_example_type(example: Example) -> ExampleType:
         raise TypeError(
             f'Expected example to be a Mapping, but found {type(example)}',
         )
-    if (
-        len(example.keys()) == 1 and any(
+
+    def match_keys(keys: KeysView) -> ExampleType:
+        if len(keys) == 1 and any(
             allowed_message_key in example
             for allowed_message_key in ALLOWED_MESSAGES_KEYS
-        )
-    ):
-        return 'chat'
-    elif (
-        len(example.keys()) == 2 and
-        any(p in example for p in ALLOWED_PROMPT_KEYS) and
-        any(r in example for r in ALLOWED_RESPONSE_KEYS)
-    ):
-        return 'prompt_response'
-    else:
+        ):
+            return 'chat'
+        elif (
+            len(example.keys()) == 2 and
+            any(p in example for p in ALLOWED_PROMPT_KEYS) and
+            any(r in example for r in ALLOWED_RESPONSE_KEYS)
+        ):
+            return 'prompt_response'
         raise UnknownExampleTypeError(str(example.keys()))
 
+    try:
+        example_type = match_keys(example.keys())
+    except UnknownExampleTypeError:
+        # We try to match the keys in lower case again.
+        example_lower = {key.lower(): value for key, value in example.items()}
+        match_keys(example_lower.keys())
+        # If there is a match then we let the user know that the keys are case senssitive.
+        raise ExampleDatasetKeyCaseError(str(example.keys()))
+    return example_type
+
 
 def _is_empty_or_nonexistent(dirpath: str) -> bool:
     """Check if a directory is empty or non-existent.

diff --git a/llmfoundry/utils/exceptions.py b/llmfoundry/utils/exceptions.py
@@ -172,6 +172,19 @@ def __init__(self, example_keys: str) -> None:
 
         super().__init__(message, example_keys=example_keys)
 
+class ExampleDatasetKeyCaseError(UserError):
+    """Error thrown when keys in a dataset example are not in lowercase.
+
+    This error checks for keys that could potentially match the expected example types if corrected.
+    """
+
+
+    def __init__(self, example_keys: str) -> None:
+        message = (
+            f"Found keys {example_keys} in the dataset. All keys in datasets must be in lowercase. "
+            f"Please ensure all keys are formatted correctly."
+        )
+        super().__init__(message, example_keys=example_keys)
 
 class NotEnoughChatDataError(UserError):
     """Error thrown when there is not enough chat data to train a model."""