From 7c9146ef4d21840830ac9ca0ba5aa33f918091ae Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Wed, 31 Jul 2024 10:22:59 +0200 Subject: [PATCH] remove code for 'manual download' script datasets (#3005) * remove code for 'manual download' script datasets * remove mention of ManualDownloadError in openapi --- docs/source/openapi.json | 93 ------------------- libs/libcommon/src/libcommon/exceptions.py | 8 -- .../job_runners/config/parquet_and_info.py | 45 +-------- .../worker/job_runners/config/split_names.py | 6 -- .../config/test_parquet_and_info.py | 14 --- .../job_runners/config/test_split_names.py | 20 +--- 6 files changed, 2 insertions(+), 184 deletions(-) diff --git a/docs/source/openapi.json b/docs/source/openapi.json index a7f76dfb22..bc22ce70b9 100644 --- a/docs/source/openapi.json +++ b/docs/source/openapi.json @@ -1845,99 +1845,6 @@ } ] } - }, - "one of the subsets has an error": { - "summary": "one of the subsets require manual download, and fails to give the split names", - "description": "Try with https://datasets-server.huggingface.co/splits?dataset=superb.", - "value": { - "splits": [ - { - "dataset": "superb", - "config": "asr", - "split": "train" - }, - { - "dataset": "superb", - "config": "asr", - "split": "validation" - }, - { - "dataset": "superb", - "config": "asr", - "split": "test" - }, - { - "dataset": "superb", - "config": "ic", - "split": "train" - }, - { - "dataset": "superb", - "config": "ic", - "split": "validation" - }, - { - "dataset": "superb", - "config": "ic", - "split": "test" - }, - { - "dataset": "superb", - "config": "ks", - "split": "train" - }, - { - "dataset": "superb", - "config": "ks", - "split": "validation" - }, - { - "dataset": "superb", - "config": "ks", - "split": "test" - }, - { - "dataset": "superb", - "config": "sd", - "split": "train" - }, - { "dataset": "superb", "config": "sd", "split": "dev" }, - { - "dataset": "superb", - "config": "sd", - "split": "test" - }, - { - "dataset": "superb", - "config": "si", - "split": "train" - }, - { - "dataset": "superb", - "config": "si", - "split": "validation" - }, - { "dataset": "superb", "config": "si", "split": "test" } - ], - "pending": [], - "failed": [ - { - "dataset": "superb", - "config": "er", - "error": { - "error": "dataset=superb requires manual download.", - "cause_exception": "ManualDownloadError", - "cause_message": " The dataset superb with config er requires manual data.\n Please follow the manual download instructions:\n\nPlease download the IEMOCAP dataset after submitting the request form here:\nhttps://sail.usc.edu/iemocap/iemocap_release.htm\nHaving downloaded the dataset you can extract it with `tar -xvzf IEMOCAP_full_release.tar.gz`\nwhich should create a folder called `IEMOCAP_full_release`\n\n Manual data can be loaded with:\n datasets.load_dataset(\"superb\", data_dir=\"\")", - "cause_traceback": [ - "Traceback (most recent call last):\n", - " File \"/src/services/worker/src/worker/job_runners/config/parquet_and_info.py\", line 299, in raise_if_requires_manual_download\n builder._check_manual_download(\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/builder.py\", line 932, in _check_manual_download\n raise ManualDownloadError(\n", - "datasets.builder.ManualDownloadError: The dataset superb with config er requires manual data.\n Please follow the manual download instructions:\n\nPlease download the IEMOCAP dataset after submitting the request form here:\nhttps://sail.usc.edu/iemocap/iemocap_release.htm\nHaving downloaded the dataset you can 
extract it with `tar -xvzf IEMOCAP_full_release.tar.gz`\nwhich should create a folder called `IEMOCAP_full_release`\n\n Manual data can be loaded with:\n datasets.load_dataset(\"superb\", data_dir=\"\")\n" - ] - } - } - ] - } } } } diff --git a/libs/libcommon/src/libcommon/exceptions.py b/libs/libcommon/src/libcommon/exceptions.py index 934d33c75b..b0b00e042d 100644 --- a/libs/libcommon/src/libcommon/exceptions.py +++ b/libs/libcommon/src/libcommon/exceptions.py @@ -81,7 +81,6 @@ def as_response(self) -> ErrorResponse: "DatasetGenerationError", "DatasetGenerationCastError", "DatasetInBlockListError", - "DatasetManualDownloadError", "DatasetModuleNotInstalledError", "DatasetNotFoundError", "DatasetScriptError", @@ -200,13 +199,6 @@ def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "DatasetGenerationCastError", cause, True) -class DatasetManualDownloadError(CacheableError): - """The dataset requires manual download.""" - - def __init__(self, message: str, cause: Optional[BaseException] = None): - super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "DatasetManualDownloadError", cause, True) - - class DatasetModuleNotInstalledError(CacheableError): """The dataset tries to import a module that is not installed.""" diff --git a/services/worker/src/worker/job_runners/config/parquet_and_info.py b/services/worker/src/worker/job_runners/config/parquet_and_info.py index 6d3b5748e3..49e4dd8a26 100644 --- a/services/worker/src/worker/job_runners/config/parquet_and_info.py +++ b/services/worker/src/worker/job_runners/config/parquet_and_info.py @@ -26,7 +26,7 @@ import requests from datasets import DownloadConfig, Features, load_dataset_builder from datasets.arrow_writer import ParquetWriter -from datasets.builder import DatasetBuilder, ManualDownloadError +from datasets.builder import DatasetBuilder from datasets.data_files import EmptyDatasetError as _EmptyDatasetError from datasets.download import StreamingDownloadManager from datasets.packaged_modules.parquet.parquet import Parquet as ParquetBuilder @@ -66,7 +66,6 @@ CreateCommitError, DatasetGenerationCastError, DatasetGenerationError, - DatasetManualDownloadError, DatasetNotFoundError, DatasetWithScriptNotSupportedError, EmptyDatasetError, @@ -263,41 +262,6 @@ def _is_too_big_from_datasets( return bool(dataset_size > max_dataset_size_bytes) -def raise_if_requires_manual_download( - builder: DatasetBuilder, - hf_endpoint: str, - hf_token: Optional[str], -) -> None: - """ - Raise an error if the dataset requires manual download. - - Args: - builder (`datasets.builder.DatasetBuilder`): - A dataset builder instance to check. - hf_endpoint (`str`): - The Hub endpoint (for example: "https://huggingface.co"). - hf_token (`str`, *optional*): - An app authentication token with read access to all the datasets. - - Raises: - [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError): - If the datasets.config.HF_ENDPOINT is not set to the expected value. - [~`libcommon.exceptions.DatasetManualDownloadError`]: - If the dataset requires manual download. - """ - if datasets.config.HF_ENDPOINT != hf_endpoint: - raise ValueError( - f"Invalid datasets.config.HF_ENDPOINT value: '{datasets.config.HF_ENDPOINT}'. Please set it to:" - f" '{hf_endpoint}'." 
- ) - try: - builder._check_manual_download( - StreamingDownloadManager(base_path=builder.base_path, download_config=DownloadConfig(token=hf_token)) - ) - except ManualDownloadError as err: - raise DatasetManualDownloadError(f"dataset={builder.repo_id} requires manual download.", cause=err) from err - - def is_dataset_too_big( dataset_info: DatasetInfo, builder: DatasetBuilder, @@ -1442,8 +1406,6 @@ def compute_config_parquet_and_info_response( If the previous step gave an error. [~`libcommon.exceptions.CreateCommitError`]: If one of the commits could not be created on the Hub. - [~`libcommon.exceptions.DatasetManualDownloadError`]: - If the dataset requires manual download. [~`libcommon.exceptions.EmptyDatasetError`]: The dataset is empty. [~`libcommon.exceptions.ConfigNamesError`]: @@ -1551,11 +1513,6 @@ def compute_config_parquet_and_info_response( writer_batch_size=writer_batch_size, ) else: - raise_if_requires_manual_download( - builder=builder, - hf_endpoint=hf_endpoint, - hf_token=hf_token, - ) dataset_info = hf_api.dataset_info(repo_id=dataset, revision=source_revision, files_metadata=True) if is_dataset_too_big( dataset_info=dataset_info, diff --git a/services/worker/src/worker/job_runners/config/split_names.py b/services/worker/src/worker/job_runners/config/split_names.py index 60a79000ab..454633da37 100644 --- a/services/worker/src/worker/job_runners/config/split_names.py +++ b/services/worker/src/worker/job_runners/config/split_names.py @@ -5,11 +5,9 @@ from typing import Optional from datasets import get_dataset_split_names -from datasets.builder import ManualDownloadError from datasets.data_files import EmptyDatasetError as _EmptyDatasetError from libcommon.dtos import FullSplitItem from libcommon.exceptions import ( - DatasetManualDownloadError, DatasetWithScriptNotSupportedError, DatasetWithTooManySplitsError, EmptyDatasetError, @@ -52,8 +50,6 @@ def compute_split_names_from_streaming_response( Maximum number of splits. Raises: - [~`libcommon.exceptions.DatasetManualDownloadError`]: - If the dataset requires manual download. [~`libcommon.exceptions.EmptyDatasetError`]: The dataset is empty. 
[~`libcommon.exceptions.SplitsNamesError`]: @@ -79,8 +75,6 @@ def compute_split_names_from_streaming_response( trust_remote_code=resolve_trust_remote_code(dataset=dataset, allow_list=dataset_scripts_allow_list), ) ] - except ManualDownloadError as err: - raise DatasetManualDownloadError(f"{dataset=} requires manual download.", cause=err) from err except _EmptyDatasetError as err: raise EmptyDatasetError("The dataset is empty.", cause=err) from err except Exception as err: diff --git a/services/worker/tests/job_runners/config/test_parquet_and_info.py b/services/worker/tests/job_runners/config/test_parquet_and_info.py index 52d3a52635..b4c856f30a 100644 --- a/services/worker/tests/job_runners/config/test_parquet_and_info.py +++ b/services/worker/tests/job_runners/config/test_parquet_and_info.py @@ -28,9 +28,6 @@ from datasets.utils.py_utils import asdict from huggingface_hub.hf_api import CommitOperationAdd, HfApi from libcommon.dtos import JobInfo, JobParams, Priority -from libcommon.exceptions import ( - DatasetManualDownloadError, -) from libcommon.queue.jobs import Queue from libcommon.resources import CacheMongoResource, QueueMongoResource from libcommon.simple_cache import upsert_response @@ -55,7 +52,6 @@ limit_parquet_writes, list_generated_parquet_files, parse_repo_filename, - raise_if_requires_manual_download, stream_convert_to_parquet, track_reads, ) @@ -228,16 +224,6 @@ def test_compute_legacy_configs( assert updated_repo_configs == {"first"} -def test_raise_if_requires_manual_download(hub_public_manual_download: str, app_config: AppConfig) -> None: - builder = load_dataset_builder(hub_public_manual_download) - with pytest.raises(DatasetManualDownloadError): - raise_if_requires_manual_download( - builder=builder, - hf_endpoint=app_config.common.hf_endpoint, - hf_token=app_config.common.hf_token, - ) - - @pytest.mark.parametrize( "name,expected", [("public", False), ("big", True)], diff --git a/services/worker/tests/job_runners/config/test_split_names.py b/services/worker/tests/job_runners/config/test_split_names.py index 493df6b132..3a7c79ca18 100644 --- a/services/worker/tests/job_runners/config/test_split_names.py +++ b/services/worker/tests/job_runners/config/test_split_names.py @@ -8,11 +8,7 @@ import pytest from libcommon.dtos import Priority -from libcommon.exceptions import ( - CustomError, - DatasetManualDownloadError, - PreviousStepFormatError, -) +from libcommon.exceptions import CustomError, PreviousStepFormatError from libcommon.resources import CacheMongoResource, QueueMongoResource from libcommon.simple_cache import ( CachedArtifactError, @@ -24,7 +20,6 @@ from worker.job_runners.config.split_names import ( ConfigSplitNamesJobRunner, compute_split_names_from_info_response, - compute_split_names_from_streaming_response, ) from worker.resources import LibrariesResource @@ -231,19 +226,6 @@ def test_compute_split_names_from_streaming_response( assert response_dict["cause_traceback"][0] == "Traceback (most recent call last):\n" -def test_compute_split_names_from_streaming_response_raises( - hub_public_manual_download: str, app_config: AppConfig -) -> None: - with pytest.raises(DatasetManualDownloadError): - compute_split_names_from_streaming_response( - hub_public_manual_download, - "default", - max_number=999, - hf_token=app_config.common.hf_token, - dataset_scripts_allow_list=[hub_public_manual_download], - ) - - def test_compute(app_config: AppConfig, get_job_runner: GetJobRunner, hub_public_csv: str) -> None: dataset = hub_public_csv config, _ = 
get_default_config_split()
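
Reviewer note (not part of the patch): for context, the deleted `raise_if_requires_manual_download` guard boiled down to the sketch below. It mirrors the removed function body; the `"superb"`/`"er"` repo id and config are illustrative only, taken from the removed openapi.json example, and `trust_remote_code=True` is an assumption since `superb` is a script dataset and recent `datasets` versions require opting in to run dataset scripts.

    from datasets import DownloadConfig, load_dataset_builder
    from datasets.builder import ManualDownloadError
    from datasets.download import StreamingDownloadManager

    # Illustrative script dataset/config from the removed openapi.json example.
    builder = load_dataset_builder("superb", "er", trust_remote_code=True)

    try:
        # The deleted helper called this to detect `manual_download_instructions`
        # declared by the dataset script before attempting the parquet conversion.
        builder._check_manual_download(
            StreamingDownloadManager(
                base_path=builder.base_path, download_config=DownloadConfig()
            )
        )
    except ManualDownloadError as err:
        # Before this patch, the workers re-raised this as the cacheable
        # `DatasetManualDownloadError`, which then surfaced in the /splits
        # response (see the removed openapi.json example above).
        print(f"manual download required: {err}")

This path appears to be dead code by the time of this patch: script datasets are rejected earlier via `DatasetWithScriptNotSupportedError`, which remains in the surviving imports of both `parquet_and_info.py` and `split_names.py`, so `ManualDownloadError` can no longer be reached from these job runners.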