Skip to content

Commit

Permalink
Catch bad data prep (#1644)
Browse files Browse the repository at this point in the history
Co-authored-by: Daniel King <[email protected]>
  • Loading branch information
milocress and dakinggg authored Nov 15, 2024
1 parent 767a7df commit db7a5c0
Show file tree
Hide file tree
Showing 4 changed files with 173 additions and 71 deletions.
32 changes: 30 additions & 2 deletions llmfoundry/command_utils/data_prep/convert_delta_to_json.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0

import json
import logging
import os
import re
Expand All @@ -27,6 +28,7 @@
FaultyDataPrepCluster,
InsufficientPermissionsError,
MalformedUCTableError,
StoragePermissionError,
UCNotEnabledError,
)

Expand Down Expand Up @@ -681,7 +683,7 @@ def fetch_DT(

log.info(f'Directory {json_output_folder} created.')

# validate_and_get_cluster_info allows cluster_id to be None if use_serverless is True
# `validate_and_get_cluster_info` allows cluster_id to be None if use_serverless is True.
method, dbsql, sparkSession = validate_and_get_cluster_info(
cluster_id=cluster_id,
databricks_host=DATABRICKS_HOST,
Expand Down Expand Up @@ -732,12 +734,38 @@ def fetch_DT(
if dbsql is not None:
dbsql.close()

# combine downloaded jsonl into one big jsonl for IFT
# Combine downloaded jsonl into one big jsonl for IFT.
iterative_combine_jsons(
json_output_folder,
os.path.join(json_output_folder, json_output_filename),
)

_validate_written_file(
json_output_folder,
json_output_filename,
delta_table_name,
)


def _validate_written_file(
json_output_folder: str,
json_output_filename: str,
delta_table_name: str,
):
# Validate downloaded dataset is actually downloaded.
with open(os.path.join(json_output_folder, json_output_filename)) as f:
is_empty = True
for line in f.readlines():
is_empty = False
try:
json.loads(line)
except Exception as e:
raise ValueError(f'Line is not valid json: {line}') from e
if is_empty:
raise StoragePermissionError(
f'Unable to download {delta_table_name}, check network permissions.',
)


def _check_imports():
try:
Expand Down
4 changes: 3 additions & 1 deletion llmfoundry/utils/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -409,8 +409,10 @@ def __init__(self, output_folder: str) -> None:
class MisconfiguredHfDatasetError(UserError):
"""Error thrown when a HuggingFace dataset is misconfigured."""

def __init__(self, dataset_name: str, split: str) -> None:
def __init__(self, dataset_name: str, split: Optional[str] = None) -> None:
message = f'Your dataset (name={dataset_name}, split={split}) is misconfigured. ' + \
'Please check your dataset format and make sure you can load your dataset locally.' \
if split is not None else f'Your dataset (name={dataset_name}) is misconfigured. ' + \
'Please check your dataset format and make sure you can load your dataset locally.'
super().__init__(message, dataset_name=dataset_name, split=split)

Expand Down
Loading

0 comments on commit db7a5c0

Please sign in to comment.