From 7c9146ef4d21840830ac9ca0ba5aa33f918091ae Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Wed, 31 Jul 2024 10:22:59 +0200 Subject: [PATCH] remove code for 'manual download' script datasets (#3005) * remove code for 'manual download' script datasets * remove mention of ManualDownloadError in openapi --- docs/source/openapi.json | 93 ------------------- libs/libcommon/src/libcommon/exceptions.py | 8 -- .../job_runners/config/parquet_and_info.py | 45 +-------- .../worker/job_runners/config/split_names.py | 6 -- .../config/test_parquet_and_info.py | 14 --- .../job_runners/config/test_split_names.py | 20 +--- 6 files changed, 2 insertions(+), 184 deletions(-) diff --git a/docs/source/openapi.json b/docs/source/openapi.json index a7f76dfb22..bc22ce70b9 100644 --- a/docs/source/openapi.json +++ b/docs/source/openapi.json @@ -1845,99 +1845,6 @@ } ] } - }, - "one of the subsets has an error": { - "summary": "one of the subsets require manual download, and fails to give the split names", - "description": "Try with https://datasets-server.huggingface.co/splits?dataset=superb.", - "value": { - "splits": [ - { - "dataset": "superb", - "config": "asr", - "split": "train" - }, - { - "dataset": "superb", - "config": "asr", - "split": "validation" - }, - { - "dataset": "superb", - "config": "asr", - "split": "test" - }, - { - "dataset": "superb", - "config": "ic", - "split": "train" - }, - { - "dataset": "superb", - "config": "ic", - "split": "validation" - }, - { - "dataset": "superb", - "config": "ic", - "split": "test" - }, - { - "dataset": "superb", - "config": "ks", - "split": "train" - }, - { - "dataset": "superb", - "config": "ks", - "split": "validation" - }, - { - "dataset": "superb", - "config": "ks", - "split": "test" - }, - { - "dataset": "superb", - "config": "sd", - "split": "train" - }, - { "dataset": "superb", "config": "sd", "split": "dev" }, - { - "dataset": "superb", - "config": "sd", - "split": "test" - }, - { - "dataset": "superb", - "config": "si", - "split": "train" - }, - { - "dataset": "superb", - "config": "si", - "split": "validation" - }, - { "dataset": "superb", "config": "si", "split": "test" } - ], - "pending": [], - "failed": [ - { - "dataset": "superb", - "config": "er", - "error": { - "error": "dataset=superb requires manual download.", - "cause_exception": "ManualDownloadError", - "cause_message": " The dataset superb with config er requires manual data.\n Please follow the manual download instructions:\n\nPlease download the IEMOCAP dataset after submitting the request form here:\nhttps://sail.usc.edu/iemocap/iemocap_release.htm\nHaving downloaded the dataset you can extract it with `tar -xvzf IEMOCAP_full_release.tar.gz`\nwhich should create a folder called `IEMOCAP_full_release`\n\n Manual data can be loaded with:\n datasets.load_dataset(\"superb\", data_dir=\"\")", - "cause_traceback": [ - "Traceback (most recent call last):\n", - " File \"/src/services/worker/src/worker/job_runners/config/parquet_and_info.py\", line 299, in raise_if_requires_manual_download\n builder._check_manual_download(\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/builder.py\", line 932, in _check_manual_download\n raise ManualDownloadError(\n", - "datasets.builder.ManualDownloadError: The dataset superb with config er requires manual data.\n Please follow the manual download instructions:\n\nPlease download the IEMOCAP dataset after submitting the request form here:\nhttps://sail.usc.edu/iemocap/iemocap_release.htm\nHaving downloaded the dataset you can 
extract it with `tar -xvzf IEMOCAP_full_release.tar.gz`\nwhich should create a folder called `IEMOCAP_full_release`\n\n Manual data can be loaded with:\n datasets.load_dataset(\"superb\", data_dir=\"\")\n" - ] - } - } - ] - } } } } diff --git a/libs/libcommon/src/libcommon/exceptions.py b/libs/libcommon/src/libcommon/exceptions.py index 934d33c75b..b0b00e042d 100644 --- a/libs/libcommon/src/libcommon/exceptions.py +++ b/libs/libcommon/src/libcommon/exceptions.py @@ -81,7 +81,6 @@ def as_response(self) -> ErrorResponse: "DatasetGenerationError", "DatasetGenerationCastError", "DatasetInBlockListError", - "DatasetManualDownloadError", "DatasetModuleNotInstalledError", "DatasetNotFoundError", "DatasetScriptError", @@ -200,13 +199,6 @@ def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "DatasetGenerationCastError", cause, True) -class DatasetManualDownloadError(CacheableError): - """The dataset requires manual download.""" - - def __init__(self, message: str, cause: Optional[BaseException] = None): - super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "DatasetManualDownloadError", cause, True) - - class DatasetModuleNotInstalledError(CacheableError): """The dataset tries to import a module that is not installed.""" diff --git a/services/worker/src/worker/job_runners/config/parquet_and_info.py b/services/worker/src/worker/job_runners/config/parquet_and_info.py index 6d3b5748e3..49e4dd8a26 100644 --- a/services/worker/src/worker/job_runners/config/parquet_and_info.py +++ b/services/worker/src/worker/job_runners/config/parquet_and_info.py @@ -26,7 +26,7 @@ import requests from datasets import DownloadConfig, Features, load_dataset_builder from datasets.arrow_writer import ParquetWriter -from datasets.builder import DatasetBuilder, ManualDownloadError +from datasets.builder import DatasetBuilder from datasets.data_files import EmptyDatasetError as _EmptyDatasetError from datasets.download import StreamingDownloadManager from datasets.packaged_modules.parquet.parquet import Parquet as ParquetBuilder @@ -66,7 +66,6 @@ CreateCommitError, DatasetGenerationCastError, DatasetGenerationError, - DatasetManualDownloadError, DatasetNotFoundError, DatasetWithScriptNotSupportedError, EmptyDatasetError, @@ -263,41 +262,6 @@ def _is_too_big_from_datasets( return bool(dataset_size > max_dataset_size_bytes) -def raise_if_requires_manual_download( - builder: DatasetBuilder, - hf_endpoint: str, - hf_token: Optional[str], -) -> None: - """ - Raise an error if the dataset requires manual download. - - Args: - builder (`datasets.builder.DatasetBuilder`): - A dataset builder instance to check. - hf_endpoint (`str`): - The Hub endpoint (for example: "https://huggingface.co"). - hf_token (`str`, *optional*): - An app authentication token with read access to all the datasets. - - Raises: - [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError): - If the datasets.config.HF_ENDPOINT is not set to the expected value. - [~`libcommon.exceptions.DatasetManualDownloadError`]: - If the dataset requires manual download. - """ - if datasets.config.HF_ENDPOINT != hf_endpoint: - raise ValueError( - f"Invalid datasets.config.HF_ENDPOINT value: '{datasets.config.HF_ENDPOINT}'. Please set it to:" - f" '{hf_endpoint}'." 
- ) - try: - builder._check_manual_download( - StreamingDownloadManager(base_path=builder.base_path, download_config=DownloadConfig(token=hf_token)) - ) - except ManualDownloadError as err: - raise DatasetManualDownloadError(f"dataset={builder.repo_id} requires manual download.", cause=err) from err - - def is_dataset_too_big( dataset_info: DatasetInfo, builder: DatasetBuilder, @@ -1442,8 +1406,6 @@ def compute_config_parquet_and_info_response( If the previous step gave an error. [~`libcommon.exceptions.CreateCommitError`]: If one of the commits could not be created on the Hub. - [~`libcommon.exceptions.DatasetManualDownloadError`]: - If the dataset requires manual download. [~`libcommon.exceptions.EmptyDatasetError`]: The dataset is empty. [~`libcommon.exceptions.ConfigNamesError`]: @@ -1551,11 +1513,6 @@ def compute_config_parquet_and_info_response( writer_batch_size=writer_batch_size, ) else: - raise_if_requires_manual_download( - builder=builder, - hf_endpoint=hf_endpoint, - hf_token=hf_token, - ) dataset_info = hf_api.dataset_info(repo_id=dataset, revision=source_revision, files_metadata=True) if is_dataset_too_big( dataset_info=dataset_info, diff --git a/services/worker/src/worker/job_runners/config/split_names.py b/services/worker/src/worker/job_runners/config/split_names.py index 60a79000ab..454633da37 100644 --- a/services/worker/src/worker/job_runners/config/split_names.py +++ b/services/worker/src/worker/job_runners/config/split_names.py @@ -5,11 +5,9 @@ from typing import Optional from datasets import get_dataset_split_names -from datasets.builder import ManualDownloadError from datasets.data_files import EmptyDatasetError as _EmptyDatasetError from libcommon.dtos import FullSplitItem from libcommon.exceptions import ( - DatasetManualDownloadError, DatasetWithScriptNotSupportedError, DatasetWithTooManySplitsError, EmptyDatasetError, @@ -52,8 +50,6 @@ def compute_split_names_from_streaming_response( Maximum number of splits. Raises: - [~`libcommon.exceptions.DatasetManualDownloadError`]: - If the dataset requires manual download. [~`libcommon.exceptions.EmptyDatasetError`]: The dataset is empty. 
[~`libcommon.exceptions.SplitsNamesError`]: @@ -79,8 +75,6 @@ def compute_split_names_from_streaming_response( trust_remote_code=resolve_trust_remote_code(dataset=dataset, allow_list=dataset_scripts_allow_list), ) ] - except ManualDownloadError as err: - raise DatasetManualDownloadError(f"{dataset=} requires manual download.", cause=err) from err except _EmptyDatasetError as err: raise EmptyDatasetError("The dataset is empty.", cause=err) from err except Exception as err: diff --git a/services/worker/tests/job_runners/config/test_parquet_and_info.py b/services/worker/tests/job_runners/config/test_parquet_and_info.py index 52d3a52635..b4c856f30a 100644 --- a/services/worker/tests/job_runners/config/test_parquet_and_info.py +++ b/services/worker/tests/job_runners/config/test_parquet_and_info.py @@ -28,9 +28,6 @@ from datasets.utils.py_utils import asdict from huggingface_hub.hf_api import CommitOperationAdd, HfApi from libcommon.dtos import JobInfo, JobParams, Priority -from libcommon.exceptions import ( - DatasetManualDownloadError, -) from libcommon.queue.jobs import Queue from libcommon.resources import CacheMongoResource, QueueMongoResource from libcommon.simple_cache import upsert_response @@ -55,7 +52,6 @@ limit_parquet_writes, list_generated_parquet_files, parse_repo_filename, - raise_if_requires_manual_download, stream_convert_to_parquet, track_reads, ) @@ -228,16 +224,6 @@ def test_compute_legacy_configs( assert updated_repo_configs == {"first"} -def test_raise_if_requires_manual_download(hub_public_manual_download: str, app_config: AppConfig) -> None: - builder = load_dataset_builder(hub_public_manual_download) - with pytest.raises(DatasetManualDownloadError): - raise_if_requires_manual_download( - builder=builder, - hf_endpoint=app_config.common.hf_endpoint, - hf_token=app_config.common.hf_token, - ) - - @pytest.mark.parametrize( "name,expected", [("public", False), ("big", True)], diff --git a/services/worker/tests/job_runners/config/test_split_names.py b/services/worker/tests/job_runners/config/test_split_names.py index 493df6b132..3a7c79ca18 100644 --- a/services/worker/tests/job_runners/config/test_split_names.py +++ b/services/worker/tests/job_runners/config/test_split_names.py @@ -8,11 +8,7 @@ import pytest from libcommon.dtos import Priority -from libcommon.exceptions import ( - CustomError, - DatasetManualDownloadError, - PreviousStepFormatError, -) +from libcommon.exceptions import CustomError, PreviousStepFormatError from libcommon.resources import CacheMongoResource, QueueMongoResource from libcommon.simple_cache import ( CachedArtifactError, @@ -24,7 +20,6 @@ from worker.job_runners.config.split_names import ( ConfigSplitNamesJobRunner, compute_split_names_from_info_response, - compute_split_names_from_streaming_response, ) from worker.resources import LibrariesResource @@ -231,19 +226,6 @@ def test_compute_split_names_from_streaming_response( assert response_dict["cause_traceback"][0] == "Traceback (most recent call last):\n" -def test_compute_split_names_from_streaming_response_raises( - hub_public_manual_download: str, app_config: AppConfig -) -> None: - with pytest.raises(DatasetManualDownloadError): - compute_split_names_from_streaming_response( - hub_public_manual_download, - "default", - max_number=999, - hf_token=app_config.common.hf_token, - dataset_scripts_allow_list=[hub_public_manual_download], - ) - - def test_compute(app_config: AppConfig, get_job_runner: GetJobRunner, hub_public_csv: str) -> None: dataset = hub_public_csv config, _ = 
get_default_config_split()
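
Reviewer note (not part of the patch): for context, the deleted `raise_if_requires_manual_download` guard boiled down to the sketch below. It mirrors the removed function body; the `"superb"`/`"er"` repo id and config are illustrative only, taken from the removed openapi.json example, and `trust_remote_code=True` is an assumption since `superb` is a script dataset and recent `datasets` versions require opting in to run dataset scripts.

    from datasets import DownloadConfig, load_dataset_builder
    from datasets.builder import ManualDownloadError
    from datasets.download import StreamingDownloadManager

    # Illustrative script dataset/config from the removed openapi.json example.
    builder = load_dataset_builder("superb", "er", trust_remote_code=True)

    try:
        # The deleted helper called this to detect `manual_download_instructions`
        # declared by the dataset script before attempting the parquet conversion.
        builder._check_manual_download(
            StreamingDownloadManager(
                base_path=builder.base_path, download_config=DownloadConfig()
            )
        )
    except ManualDownloadError as err:
        # Before this patch, the workers re-raised this as the cacheable
        # `DatasetManualDownloadError`, which then surfaced in the /splits
        # response (see the removed openapi.json example above).
        print(f"manual download required: {err}")

This path appears to be dead code by the time of this patch: script datasets are rejected earlier via `DatasetWithScriptNotSupportedError`, which remains in the surviving imports of both `parquet_and_info.py` and `split_names.py`, so `ManualDownloadError` can no longer be reached from these job runners.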