Skip to content

Commit

Permalink
support more than 10k parquet files in config-parquet-metadata (#2770)
Browse files — browse the repository at this point in the history
  • Loading branch information
severo authored May 3, 2024
1 parent 906fec5 commit 4afa2ce
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,14 @@ def create_parquet_metadata_file_from_remote_parquet(
parquet_file = get_parquet_file(url=parquet_file_item["url"], fs=fs, hf_token=hf_token)
except Exception as e:
raise FileSystemError(f"Could not read the parquet files: {e}") from e
split = parquet_file_item["url"].split("/")[-2]
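# ^ see https://github.com/huggingface/dataset-viewer/issues/2768
# The directory name is taken from the URL (its second-to-last path segment) instead of the
# "split" field, to support splits with more than 10k parquet files: in that case the
# subdirectories are not named after the split ("train", for example) but are
# "train-part0", "train-part1", "train-part2", etc.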
parquet_metadata_subpath = create_parquet_metadata_file(
dataset=parquet_file_item["dataset"],
config=parquet_file_item["config"],
split=parquet_file_item["split"],
split=split,
parquet_file_metadata=parquet_file.metadata,
filename=parquet_file_item["filename"],
parquet_metadata_directory=parquet_metadata_directory,
Expand Down
81 changes: 73 additions & 8 deletions services/worker/tests/job_runners/config/test_parquet_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,10 +95,20 @@ def _get_job_runner(
ConfigParquetResponse(
parquet_files=[
SplitHubFile(
dataset="ok", config="config_1", split="train", url="url1", filename="filename1", size=0
dataset="ok",
config="config_1",
split="train",
url="https://url1/train/0000.parquet",
filename="filename1",
size=0,
),
SplitHubFile(
dataset="ok", config="config_1", split="train", url="url2", filename="filename2", size=0
dataset="ok",
config="config_1",
split="train",
url="https://url1/train/0001.parquet",
filename="filename2",
size=0,
),
],
partial=False,
Expand All @@ -111,7 +121,7 @@ def _get_job_runner(
dataset="ok",
config="config_1",
split="train",
url="url1",
url="https://url1/train/0000.parquet",
filename="filename1",
size=0,
num_rows=3,
Expand All @@ -121,7 +131,7 @@ def _get_job_runner(
dataset="ok",
config="config_1",
split="train",
url="url2",
url="https://url1/train/0001.parquet",
filename="filename2",
size=0,
num_rows=3,
Expand Down Expand Up @@ -161,15 +171,15 @@ def _get_job_runner(
dataset="with_features",
config="config_1",
split="train",
url="url1",
url="https://url1/train/0000.parquet",
filename="filename1",
size=0,
),
SplitHubFile(
dataset="with_features",
config="config_1",
split="train",
url="url2",
url="https://url1/train/0001.parquet",
filename="filename2",
size=0,
),
Expand All @@ -184,7 +194,7 @@ def _get_job_runner(
dataset="with_features",
config="config_1",
split="train",
url="url1",
url="https://url1/train/0000.parquet",
filename="filename1",
size=0,
num_rows=3,
Expand All @@ -194,7 +204,7 @@ def _get_job_runner(
dataset="with_features",
config="config_1",
split="train",
url="url2",
url="https://url1/train/0001.parquet",
filename="filename2",
size=0,
num_rows=3,
Expand All @@ -206,6 +216,61 @@ def _get_job_runner(
),
False,
),
(
"more_than_10k_files",
"config_1",
HTTPStatus.OK,
ConfigParquetResponse(
parquet_files=[
SplitHubFile(
dataset="with_features",
config="config_1",
split="train",
url="https://url1/train-part0/0000.parquet",
filename="filename1",
size=0,
),
SplitHubFile(
dataset="with_features",
config="config_1",
split="train",
url="https://url1/train-part1/0000.parquet",
filename="filename2",
size=0,
),
],
partial=False,
features=Features({"a": Value("string")}).to_dict(),
),
None,
ConfigParquetMetadataResponse(
parquet_files_metadata=[
ParquetFileMetadataItem(
dataset="with_features",
config="config_1",
split="train",
url="https://url1/train-part0/0000.parquet",
filename="filename1",
size=0,
num_rows=3,
parquet_metadata_subpath="with_features/--/config_1/train-part0/filename1",
),
ParquetFileMetadataItem(
dataset="with_features",
config="config_1",
split="train",
url="https://url1/train-part1/0000.parquet",
filename="filename2",
size=0,
num_rows=3,
parquet_metadata_subpath="with_features/--/config_1/train-part1/filename2",
),
],
partial=False,
features=Features({"a": Value("string")}).to_dict(),
),
False,
),
],
)
def test_compute(
Expand Down

0 comments on commit 4afa2ce

Please sign in to comment.