diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py
index 6b4bd25936..6d37d957e1 100644
--- a/llmfoundry/data/finetuning/tasks.py
+++ b/llmfoundry/data/finetuning/tasks.py
@@ -71,6 +71,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:
     ALLOWED_MESSAGES_KEYS,
     ALLOWED_PROMPT_KEYS,
     ALLOWED_RESPONSE_KEYS,
+    BadDatasetSplitError,
     ChatTemplateError,
     ConsecutiveRepeatedChatRolesError,
     DatasetTooSmallError,
@@ -1047,6 +1048,14 @@ def dataset_mapper(example: dict):
                     dataset_name=dataset_name,
                     split=split,
                 ) from error
+            elif isinstance(error, ValueError) and 'Split name should match' in str(
+                error,
+            ):
+                log.error('Huggingface split ValueError during data prep.')
+                raise BadDatasetSplitError(
+                    dataset_name=dataset_name,
+                    split=split,
+                ) from error
             if error is not None:
                 log.error('Error during data prep')
                 raise error
diff --git a/llmfoundry/utils/exceptions.py b/llmfoundry/utils/exceptions.py
index 53d3baebfc..7f013a7037 100644
--- a/llmfoundry/utils/exceptions.py
+++ b/llmfoundry/utils/exceptions.py
@@ -417,6 +417,18 @@ def __init__(self, dataset_name: str, split: Optional[str] = None) -> None:
         super().__init__(message, dataset_name=dataset_name, split=split)
 
 
+class BadDatasetSplitError(UserError):
+    """Error thrown when a HuggingFace dataset split name is invalid."""
+
+    def __init__(self, dataset_name: str, split: Optional[str] = None) -> None:
+        reg = r'^\w+(\.\w+)*$'
+        message = f'Your dataset (name={dataset_name}, split={split}) has an invalid split. ' + \
+            f'Please check your split name to make sure it matches the pattern "{reg}"' \
+            if split is not None else f'Your dataset (name={dataset_name}) is misconfigured. ' + \
+            f'Please check your split name to make sure it matches the pattern "{reg}"'
+        super().__init__(message, dataset_name=dataset_name, split=split)
+
+
 class InvalidDatasetError(UserError):
     """Error thrown when a dataset contains no valid samples for training."""
 