Skip to content

Commit

Permalink
make custom exceptions for FileNotFound in dataloader and `convert_…
Browse files Browse the repository at this point in the history
…text_to_mds`
  • Loading branch information
angel-ruiz7 committed May 31, 2024
1 parent fb9a225 commit 200693d
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 12 deletions.
10 changes: 2 additions & 8 deletions llmfoundry/data/finetuning/dataloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from llmfoundry.data.text_data import build_streams
from llmfoundry.utils.config_utils import to_dict_container
from llmfoundry.utils.exceptions import (
DatasetMissingFileError,
MissingHuggingFaceURLSplitError,
NotEnoughDatasetSamplesError,
)
Expand Down Expand Up @@ -541,14 +542,7 @@ def _download_remote_hf_dataset(remote_path: str, split: str) -> str:
get_file(path=name, destination=destination, overwrite=True)
except FileNotFoundError as e:
if extension == SUPPORTED_EXTENSIONS[-1]:
files_searched = [
f'{name}/{split}{ext}' for ext in SUPPORTED_EXTENSIONS
]
raise FileNotFoundError(
f'Could not find a file with any of ' + \
f'the supported extensions: {SUPPORTED_EXTENSIONS}\n' + \
f'at {files_searched}',
) from e
raise DatasetMissingFileError(file_name=f"name/{split}") from e
else:
log.debug(
f'Could not find {name}, looking for another extension',
Expand Down
19 changes: 19 additions & 0 deletions llmfoundry/utils/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
"""Custom exceptions for the LLMFoundry."""
from typing import Any, Dict, List, Literal, Optional, Union

from llmfoundry.data.finetuning.tasks import SUPPORTED_EXTENSIONS

__all__ = [
'ALLOWED_RESPONSE_KEYS',
'ALLOWED_PROMPT_KEYS',
Expand Down Expand Up @@ -354,3 +356,20 @@ class RunTimeoutError(InternalError):
def __init__(self, timeout: int) -> None:
message = f'Run timed out after {timeout} seconds.'
super().__init__(message, timeout=timeout)


class DatasetMissingFileError(UserError):
    """Error thrown when a dataset cannot find a file.

    Args:
        file_name (str): Name of the file that could not be found. It is
            interpolated into the error message next to the list of
            supported extensions and forwarded to the base class as the
            ``file_name`` keyword argument.
    """

    def __init__(self, file_name: str) -> None:
        # The first fragment must be an f-string: without the prefix the
        # user sees the literal text "{file_name}" instead of the file name.
        message = (
            f"Could not find the file '{file_name}' with any of the supported extensions: "
            + ', '.join(SUPPORTED_EXTENSIONS) + '.'
            + ' Please check your train / eval data and try again.'
        )
        super().__init__(message, file_name=file_name)


class DatasetInvalidFolderError(UserError):
    """Raised when no objects can be found at a dataset folder path.

    Args:
        folder_path (str): The folder path that was searched. It is embedded
            in the error message and forwarded to the base class as the
            ``folder_path`` keyword argument.
    """

    def __init__(self, folder_path: str) -> None:
        super().__init__(
            f"Could not find objects at the path '{folder_path}'. "
            "Please check your `input_folder` and try again.",
            folder_path=folder_path,
        )
12 changes: 8 additions & 4 deletions scripts/data_prep/convert_text_to_mds.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
merge_shard_groups,
)
from llmfoundry.utils.exceptions import (
DatasetInvalidFolderError,
InputFolderMissingDataError,
OutputFolderNotEmptyError,
)
Expand Down Expand Up @@ -232,10 +233,13 @@ def get_object_names(input_folder: str) -> List[str]:
object_store = maybe_create_object_store_from_uri(input_folder)
if object_store is not None:
_, _, folder_prefix = parse_uri(input_folder)
names = [
name for name in object_store.list_objects(folder_prefix)
if name.endswith('.txt')
]
try:
names = [
name for name in object_store.list_objects(folder_prefix)
if name.endswith('.txt')
]
except FileNotFoundError as e:
raise DatasetInvalidFolderError(input_folder) from e
else:
# input_folder is a local folder
names = [
Expand Down

0 comments on commit 200693d

Please sign in to comment.