allow case insensitive example keys

tmp fix fix revert
mosaicml · May 31, 2024 · 051fcab · 051fcab
1 parent 69d9d29
commit 051fcab
Showing 3 changed files with 31 additions and 13 deletions.
diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py
@@ -542,7 +542,7 @@ def _download_remote_hf_dataset(remote_path: str, split: str) -> str:
             except FileNotFoundError as e:
                 if extension == SUPPORTED_EXTENSIONS[-1]:
                     files_searched = [
-                        f'{name[:-len(extension)]}{ext}' for ext in SUPPORTED_EXTENSIONS
+                        f'{name}/{split}{ext}' for ext in SUPPORTED_EXTENSIONS
                     ]
                     raise FileNotFoundError(
                         f'Could not find a file with any of ' + \

diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py
@@ -35,7 +35,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:
 import logging
 import os
 import warnings
-from collections.abc import Mapping
+from collections.abc import KeysView, Mapping
 from functools import partial
 from pathlib import Path
 from typing import (
@@ -71,6 +71,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:
     ALLOWED_RESPONSE_KEYS,
     ChatTemplateError,
     ConsecutiveRepeatedChatRolesError,
+    ExampleDatasetKeyCaseError,
     IncorrectMessageKeyQuantityError,
     InvalidContentTypeError,
     InvalidFileExtensionError,
@@ -134,20 +135,28 @@ def _get_example_type(example: Example) -> ExampleType:
         raise TypeError(
             f'Expected example to be a Mapping, but found {type(example)}',
         )
-    if (
-        len(example.keys()) == 1 and any(
+
+    def match_keys(keys: KeysView) -> str:
+        if len(keys) == 1 and any(
             allowed_message_key in example
             for allowed_message_key in ALLOWED_MESSAGES_KEYS
-        )
-    ):
-        return 'chat'
-    elif (
-        len(example.keys()) == 2 and
-        any(p in example for p in ALLOWED_PROMPT_KEYS) and
-        any(r in example for r in ALLOWED_RESPONSE_KEYS)
-    ):
-        return 'prompt_response'
+        ):
+            return 'chat'
+        elif (
+            len(example.keys()) == 2 and
+            any(p in example for p in ALLOWED_PROMPT_KEYS) and
+            any(r in example for r in ALLOWED_RESPONSE_KEYS)
+        ):
+            return 'prompt_response'
+        return 'unknown'
+
+    example_type = match_keys(example.keys())
+    if example_type != 'unknown':
+        return example_type
     else:
+        # We try to match the keys in lowercase to give a more informative error message.
+        if match_keys([key.lower() for key in example.keys()]) != 'unknown':
+            raise ExampleDatasetKeyCaseError(str(example.keys()))
         raise UnknownExampleTypeError(str(example.keys()))
 
 

diff --git a/llmfoundry/utils/exceptions.py b/llmfoundry/utils/exceptions.py
@@ -172,6 +172,15 @@ def __init__(self, example_keys: str) -> None:
 
         super().__init__(message, example_keys=example_keys)
 
+class ExampleDatasetKeyCaseError(UserError):
+    """Error thrown when keys in a dataset example are not in lowercase, potentially matching the expected example types if corrected."""
+
+    def __init__(self, example_keys: str) -> None:
+        message = (
+            f"Found keys {example_keys} in the dataset. All keys in datasets must be in lowercase. "
+            f"Please ensure all keys are formatted correctly."
+        )
+        super().__init__(message, example_keys=example_keys)
 
 class NotEnoughChatDataError(UserError):
     """Error thrown when there is not enough chat data to train a model."""