remove code for 'manual download' script datasets (#3005)
* remove code for 'manual download' script datasets

* remove mention of ManualDownloadError in openapi
severo authored Jul 31, 2024
1 parent df29e85 commit 7c9146e
Showing 6 changed files with 2 additions and 184 deletions.
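For context, a "manual download" dataset is a script-based dataset whose data files cannot be fetched automatically (typically for licensing reasons, as with IEMOCAP here). The error message quoted in the removed openapi.json example below shows the user-facing behavior in the `datasets` library; a minimal sketch of it, reusing the `superb`/`er` example from the diff (the `trust_remote_code` flag is assumed to be needed for this script dataset on recent `datasets` versions):

```python
from datasets import load_dataset
from datasets.builder import ManualDownloadError

try:
    # "superb" / "er" wraps IEMOCAP, whose archive must be fetched by hand.
    load_dataset("superb", "er", trust_remote_code=True)
except ManualDownloadError as err:
    # The exception text carries the manual download instructions.
    print(err)

# After downloading and extracting the archive manually, the dataset loads with:
# load_dataset("superb", "er", data_dir="<path/to/IEMOCAP_full_release>", trust_remote_code=True)
```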
93 changes: 0 additions & 93 deletions docs/source/openapi.json
@@ -1845,99 +1845,6 @@
}
]
}
},
"one of the subsets has an error": {
"summary": "one of the subsets require manual download, and fails to give the split names",
"description": "Try with https://datasets-server.huggingface.co/splits?dataset=superb.",
"value": {
"splits": [
{
"dataset": "superb",
"config": "asr",
"split": "train"
},
{
"dataset": "superb",
"config": "asr",
"split": "validation"
},
{
"dataset": "superb",
"config": "asr",
"split": "test"
},
{
"dataset": "superb",
"config": "ic",
"split": "train"
},
{
"dataset": "superb",
"config": "ic",
"split": "validation"
},
{
"dataset": "superb",
"config": "ic",
"split": "test"
},
{
"dataset": "superb",
"config": "ks",
"split": "train"
},
{
"dataset": "superb",
"config": "ks",
"split": "validation"
},
{
"dataset": "superb",
"config": "ks",
"split": "test"
},
{
"dataset": "superb",
"config": "sd",
"split": "train"
},
{ "dataset": "superb", "config": "sd", "split": "dev" },
{
"dataset": "superb",
"config": "sd",
"split": "test"
},
{
"dataset": "superb",
"config": "si",
"split": "train"
},
{
"dataset": "superb",
"config": "si",
"split": "validation"
},
{ "dataset": "superb", "config": "si", "split": "test" }
],
"pending": [],
"failed": [
{
"dataset": "superb",
"config": "er",
"error": {
"error": "dataset=superb requires manual download.",
"cause_exception": "ManualDownloadError",
"cause_message": " The dataset superb with config er requires manual data.\n Please follow the manual download instructions:\n\nPlease download the IEMOCAP dataset after submitting the request form here:\nhttps://sail.usc.edu/iemocap/iemocap_release.htm\nHaving downloaded the dataset you can extract it with `tar -xvzf IEMOCAP_full_release.tar.gz`\nwhich should create a folder called `IEMOCAP_full_release`\n\n Manual data can be loaded with:\n datasets.load_dataset(\"superb\", data_dir=\"<path/to/manual/data>\")",
"cause_traceback": [
"Traceback (most recent call last):\n",
" File \"/src/services/worker/src/worker/job_runners/config/parquet_and_info.py\", line 299, in raise_if_requires_manual_download\n builder._check_manual_download(\n",
" File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/builder.py\", line 932, in _check_manual_download\n raise ManualDownloadError(\n",
"datasets.builder.ManualDownloadError: The dataset superb with config er requires manual data.\n Please follow the manual download instructions:\n\nPlease download the IEMOCAP dataset after submitting the request form here:\nhttps://sail.usc.edu/iemocap/iemocap_release.htm\nHaving downloaded the dataset you can extract it with `tar -xvzf IEMOCAP_full_release.tar.gz`\nwhich should create a folder called `IEMOCAP_full_release`\n\n Manual data can be loaded with:\n datasets.load_dataset(\"superb\", data_dir=\"<path/to/manual/data>\")\n"
]
}
}
]
}
}
}
}
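The removed openapi.json example documented how such a failure surfaced in the /splits response: successful configs under "splits", and the manual-download config under "failed" with cause_exception set to "ManualDownloadError". A short sketch of inspecting that response shape (the endpoint URL comes from the removed description; after this change the "failed" entries no longer carry this particular cause):

```python
import requests

# Endpoint taken from the removed example's description.
response = requests.get(
    "https://datasets-server.huggingface.co/splits",
    params={"dataset": "superb"},
    timeout=30,
)
body = response.json()
for split in body.get("splits", []):
    print(split["config"], split["split"])
for failure in body.get("failed", []):
    # Each failed entry wraps an error with cause_exception/cause_message fields.
    print(failure["config"], failure["error"].get("cause_exception"))
```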
8 changes: 0 additions & 8 deletions libs/libcommon/src/libcommon/exceptions.py
@@ -81,7 +81,6 @@ def as_response(self) -> ErrorResponse:
"DatasetGenerationError",
"DatasetGenerationCastError",
"DatasetInBlockListError",
"DatasetManualDownloadError",
"DatasetModuleNotInstalledError",
"DatasetNotFoundError",
"DatasetScriptError",
@@ -200,13 +199,6 @@ def __init__(self, message: str, cause: Optional[BaseException] = None):
super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "DatasetGenerationCastError", cause, True)


class DatasetManualDownloadError(CacheableError):
"""The dataset requires manual download."""

def __init__(self, message: str, cause: Optional[BaseException] = None):
super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "DatasetManualDownloadError", cause, True)


class DatasetModuleNotInstalledError(CacheableError):
"""The dataset tries to import a module that is not installed."""

45 changes: 1 addition & 44 deletions services/worker/src/worker/job_runners/config/parquet_and_info.py
@@ -26,7 +26,7 @@
import requests
from datasets import DownloadConfig, Features, load_dataset_builder
from datasets.arrow_writer import ParquetWriter
from datasets.builder import DatasetBuilder, ManualDownloadError
from datasets.builder import DatasetBuilder
from datasets.data_files import EmptyDatasetError as _EmptyDatasetError
from datasets.download import StreamingDownloadManager
from datasets.packaged_modules.parquet.parquet import Parquet as ParquetBuilder
@@ -66,7 +66,6 @@
CreateCommitError,
DatasetGenerationCastError,
DatasetGenerationError,
DatasetManualDownloadError,
DatasetNotFoundError,
DatasetWithScriptNotSupportedError,
EmptyDatasetError,
@@ -263,41 +262,6 @@ def _is_too_big_from_datasets(
return bool(dataset_size > max_dataset_size_bytes)


def raise_if_requires_manual_download(
builder: DatasetBuilder,
hf_endpoint: str,
hf_token: Optional[str],
) -> None:
"""
Raise an error if the dataset requires manual download.
Args:
builder (`datasets.builder.DatasetBuilder`):
A dataset builder instance to check.
hf_endpoint (`str`):
The Hub endpoint (for example: "https://huggingface.co").
hf_token (`str`, *optional*):
An app authentication token with read access to all the datasets.
Raises:
[`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError):
If the datasets.config.HF_ENDPOINT is not set to the expected value.
[~`libcommon.exceptions.DatasetManualDownloadError`]:
If the dataset requires manual download.
"""
if datasets.config.HF_ENDPOINT != hf_endpoint:
raise ValueError(
f"Invalid datasets.config.HF_ENDPOINT value: '{datasets.config.HF_ENDPOINT}'. Please set it to:"
f" '{hf_endpoint}'."
)
try:
builder._check_manual_download(
StreamingDownloadManager(base_path=builder.base_path, download_config=DownloadConfig(token=hf_token))
)
except ManualDownloadError as err:
raise DatasetManualDownloadError(f"dataset={builder.repo_id} requires manual download.", cause=err) from err


def is_dataset_too_big(
dataset_info: DatasetInfo,
builder: DatasetBuilder,
@@ -1442,8 +1406,6 @@ def compute_config_parquet_and_info_response(
If the previous step gave an error.
[~`libcommon.exceptions.CreateCommitError`]:
If one of the commits could not be created on the Hub.
[~`libcommon.exceptions.DatasetManualDownloadError`]:
If the dataset requires manual download.
[~`libcommon.exceptions.EmptyDatasetError`]:
The dataset is empty.
[~`libcommon.exceptions.ConfigNamesError`]:
@@ -1551,11 +1513,6 @@ def compute_config_parquet_and_info_response(
writer_batch_size=writer_batch_size,
)
else:
raise_if_requires_manual_download(
builder=builder,
hf_endpoint=hf_endpoint,
hf_token=hf_token,
)
dataset_info = hf_api.dataset_info(repo_id=dataset, revision=source_revision, files_metadata=True)
if is_dataset_too_big(
dataset_info=dataset_info,
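For reference, the deleted `raise_if_requires_manual_download` helper ran a cheap streaming pre-check before any download. A condensed sketch of the same check outside the worker, mirroring the removed code above (`_check_manual_download` is a private `datasets` API and may change between versions):

```python
from datasets import DownloadConfig, load_dataset_builder
from datasets.builder import ManualDownloadError
from datasets.download import StreamingDownloadManager

builder = load_dataset_builder("superb", "er", trust_remote_code=True)
try:
    # Ask the builder whether its data must be fetched manually,
    # without downloading anything (same call as the removed helper).
    builder._check_manual_download(
        StreamingDownloadManager(base_path=builder.base_path, download_config=DownloadConfig())
    )
except ManualDownloadError as err:
    print(f"dataset={builder.repo_id} requires manual download: {err}")
```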
6 changes: 0 additions & 6 deletions services/worker/src/worker/job_runners/config/split_names.py
@@ -5,11 +5,9 @@
from typing import Optional

from datasets import get_dataset_split_names
from datasets.builder import ManualDownloadError
from datasets.data_files import EmptyDatasetError as _EmptyDatasetError
from libcommon.dtos import FullSplitItem
from libcommon.exceptions import (
DatasetManualDownloadError,
DatasetWithScriptNotSupportedError,
DatasetWithTooManySplitsError,
EmptyDatasetError,
@@ -52,8 +50,6 @@ def compute_split_names_from_streaming_response(
Maximum number of splits.
Raises:
[~`libcommon.exceptions.DatasetManualDownloadError`]:
If the dataset requires manual download.
[~`libcommon.exceptions.EmptyDatasetError`]:
The dataset is empty.
[~`libcommon.exceptions.SplitsNamesError`]:
@@ -79,8 +75,6 @@ def compute_split_names_from_streaming_response(
trust_remote_code=resolve_trust_remote_code(dataset=dataset, allow_list=dataset_scripts_allow_list),
)
]
except ManualDownloadError as err:
raise DatasetManualDownloadError(f"{dataset=} requires manual download.", cause=err) from err
except _EmptyDatasetError as err:
raise EmptyDatasetError("The dataset is empty.", cause=err) from err
except Exception as err:
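With the dedicated branch gone, a `ManualDownloadError` raised by `get_dataset_split_names` now falls through to the function's generic exception handler (not shown in this hunk). A sketch of the resulting flow, assuming the generic handler wraps errors in `SplitsNamesError` as the docstring suggests (the exact message is an assumption):

```python
from datasets import get_dataset_split_names
from datasets.data_files import EmptyDatasetError as _EmptyDatasetError
from libcommon.exceptions import EmptyDatasetError, SplitsNamesError


def split_names_after_this_change(dataset: str, config: str) -> list[str]:
    try:
        return get_dataset_split_names(path=dataset, config_name=config)
    except _EmptyDatasetError as err:
        raise EmptyDatasetError("The dataset is empty.", cause=err) from err
    except Exception as err:
        # ManualDownloadError now takes this path too: it is wrapped in the
        # generic SplitsNamesError instead of a dedicated DatasetManualDownloadError.
        raise SplitsNamesError("Cannot get the split names for the config.", cause=err) from err
```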
14 changes: 0 additions & 14 deletions services/worker/tests/job_runners/config/test_parquet_and_info.py
@@ -28,9 +28,6 @@
from datasets.utils.py_utils import asdict
from huggingface_hub.hf_api import CommitOperationAdd, HfApi
from libcommon.dtos import JobInfo, JobParams, Priority
from libcommon.exceptions import (
DatasetManualDownloadError,
)
from libcommon.queue.jobs import Queue
from libcommon.resources import CacheMongoResource, QueueMongoResource
from libcommon.simple_cache import upsert_response
@@ -55,7 +52,6 @@
limit_parquet_writes,
list_generated_parquet_files,
parse_repo_filename,
raise_if_requires_manual_download,
stream_convert_to_parquet,
track_reads,
)
@@ -228,16 +224,6 @@ def test_compute_legacy_configs(
assert updated_repo_configs == {"first"}


def test_raise_if_requires_manual_download(hub_public_manual_download: str, app_config: AppConfig) -> None:
builder = load_dataset_builder(hub_public_manual_download)
with pytest.raises(DatasetManualDownloadError):
raise_if_requires_manual_download(
builder=builder,
hf_endpoint=app_config.common.hf_endpoint,
hf_token=app_config.common.hf_token,
)


@pytest.mark.parametrize(
"name,expected",
[("public", False), ("big", True)],
20 changes: 1 addition & 19 deletions services/worker/tests/job_runners/config/test_split_names.py
@@ -8,11 +8,7 @@

import pytest
from libcommon.dtos import Priority
from libcommon.exceptions import (
CustomError,
DatasetManualDownloadError,
PreviousStepFormatError,
)
from libcommon.exceptions import CustomError, PreviousStepFormatError
from libcommon.resources import CacheMongoResource, QueueMongoResource
from libcommon.simple_cache import (
CachedArtifactError,
@@ -24,7 +20,6 @@
from worker.job_runners.config.split_names import (
ConfigSplitNamesJobRunner,
compute_split_names_from_info_response,
compute_split_names_from_streaming_response,
)
from worker.resources import LibrariesResource

@@ -231,19 +226,6 @@ def test_compute_split_names_from_streaming_response(
assert response_dict["cause_traceback"][0] == "Traceback (most recent call last):\n"


def test_compute_split_names_from_streaming_response_raises(
hub_public_manual_download: str, app_config: AppConfig
) -> None:
with pytest.raises(DatasetManualDownloadError):
compute_split_names_from_streaming_response(
hub_public_manual_download,
"default",
max_number=999,
hf_token=app_config.common.hf_token,
dataset_scripts_allow_list=[hub_public_manual_download],
)


def test_compute(app_config: AppConfig, get_job_runner: GetJobRunner, hub_public_csv: str) -> None:
dataset = hub_public_csv
config, _ = get_default_config_split()