remove support for script-based datasets #3046

Merged: 4 commits, Aug 26, 2024
2 changes: 0 additions & 2 deletions chart/templates/_env/_envCommon.tpl
@@ -4,8 +4,6 @@
 {{- define "envCommon" -}}
 - name: COMMON_BLOCKED_DATASETS
   value: {{ .Values.common.blockedDatasets | quote}}
-- name: COMMON_DATASET_SCRIPTS_ALLOW_LIST
-  value: {{ .Values.common.datasetScriptsAllowList | quote}}
 - name: COMMON_HF_ENDPOINT
   value: {{ include "datasetsServer.hub.url" . }}
 - name: HF_ENDPOINT # see https://github.com/huggingface/datasets/pull/5196#issuecomment-1322191411
2 changes: 0 additions & 2 deletions chart/templates/_env/_envWorker.tpl
@@ -47,8 +47,6 @@
 {{- end }}
 - name: PARQUET_AND_INFO_MAX_DATASET_SIZE_BYTES
   value: {{ .Values.parquetAndInfo.maxDatasetSizeBytes | quote }}
-- name: PARQUET_AND_INFO_MAX_EXTERNAL_DATA_FILES
-  value: {{ .Values.parquetAndInfo.maxExternalDataFiles | quote }}
 - name: PARQUET_AND_INFO_MAX_ROW_GROUP_BYTE_SIZE_FOR_COPY
   value: {{ .Values.parquetAndInfo.maxRowGroupByteSizeForCopy | quote }}
 - name: PARQUET_AND_INFO_SOURCE_REVISION
3 changes: 0 additions & 3 deletions chart/values.yaml
@@ -80,9 +80,6 @@ images:
 common:
   # Comma-separated list of blocked datasets. No jobs will be processed for those datasets.
   blockedDatasets: "open-llm-leaderboard-old/details_*,lunaluan/*,atom-in-the-universe/*,cot-leaderboard/cot-eval-traces,mitermix/yt-links,mcding-org/*"
-  # Comma-separated list of the datasets for which we support dataset scripts.
-  # Unix shell-style wildcards also work in the dataset name for namespaced datasets, for example `some_namespace/*` to refer to all the datasets in the `some_namespace` namespace.
-  datasetScriptsAllowList: "hf-internal-testing/dataset_with_script,togethercomputer/RedPajama-Data-1T,togethercomputer/RedPajama-Data-V2,gaia-benchmark/GAIA,poloclub/diffusiondb,mozilla-foundation/common_voice_*,google/fleurs,speechcolab/gigaspeech,espnet/yodas"
   # URL of the HuggingFace Hub
   hfEndpoint: ""
 
10 changes: 1 addition & 9 deletions docs/source/openapi.json
@@ -128,10 +128,7 @@
 {
   "$ref": "#/components/schemas/X-Error-Code-ResponseNotReadyError"
 },
-{ "$ref": "#/components/schemas/X-Error-Code-UnexpectedError" },
-{
-  "$ref": "#/components/schemas/X-Error-Code-ExternalFilesSizeRequestHTTPError"
-}
+{ "$ref": "#/components/schemas/X-Error-Code-UnexpectedError" }
 ]
 },
 "required": false
@@ -1311,11 +1308,6 @@
   "const": "ExternalAuthenticatedError",
   "description": "Raised when the external authentication check failed while the user was authenticated. Even if the external authentication server returns 403 in that case, we return 404 because we don't know if the dataset exist or not. It's also coherent with how the Hugging Face Hub works."
 },
-"X-Error-Code-ExternalFilesSizeRequestHTTPError": {
-  "type": "string",
-  "const": "ExternalFilesSizeRequestHTTPError",
-  "description": "We failed to get the size of the external files."
-},
 "X-Error-Code-ExternalUnauthenticatedError": {
   "type": "string",
   "const": "ExternalUnauthenticatedError",
1 change: 0 additions & 1 deletion libs/libcommon/README.md
@@ -33,7 +33,6 @@ To enable signed URLs on the StorageClient, `CLOUDFRONT_KEY_PAIR_ID` and `CLOUDF
 Set the common environment variables to configure the following aspects:
 
 - `COMMON_BLOCKED_DATASETS`: comma-separated list of the blocked datasets. Unix shell-style wildcards also work in the dataset name for namespaced datasets, for example `some_namespace/*` to block all the datasets in the `some_namespace` namespace. If empty, no dataset is blocked. Defaults to empty.
-- `COMMON_DATASET_SCRIPTS_ALLOW_LIST`: comma-separated list of the datasets for which we support dataset scripts. Unix shell-style wildcards also work in the dataset name for namespaced datasets, for example `some_namespace/*` to refer to all the datasets in the `some_namespace` namespace. If empty, no dataset with script is supported. Defaults to empty.
 - `COMMON_HF_ENDPOINT`: URL of the HuggingFace Hub. Defaults to `https://huggingface.co`.
 - `COMMON_HF_TOKEN`: App Access Token (ask moonlanding administrators to get one, only the `read` role is required) to access the gated datasets. Defaults to empty.
 
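For context, the blocking behavior that remains after this change is plain Unix shell-style wildcard matching on dataset names. A minimal sketch of that matching (the `is_blocked` helper is illustrative, not the exact libcommon API):

```python
# Sketch of Unix shell-style wildcard matching for COMMON_BLOCKED_DATASETS.
# `is_blocked` is an illustrative helper, not the exact libcommon function.
from fnmatch import fnmatch


def is_blocked(dataset: str, blocked_patterns: list[str]) -> bool:
    """Return True if the dataset name matches any pattern, e.g. `some_namespace/*`."""
    return any(fnmatch(dataset, pattern) for pattern in blocked_patterns)


print(is_blocked("lunaluan/my-dataset", ["lunaluan/*", "mitermix/yt-links"]))  # True
print(is_blocked("squad", ["some_namespace/*"]))                               # False
```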
5 changes: 0 additions & 5 deletions libs/libcommon/src/libcommon/config.py
@@ -145,15 +145,13 @@ def from_env(cls) -> "RowsIndexConfig":
 
 
 COMMON_BLOCKED_DATASETS: list[str] = []
-COMMON_DATASET_SCRIPTS_ALLOW_LIST: list[str] = []
 COMMON_HF_ENDPOINT = "https://huggingface.co"
 COMMON_HF_TOKEN = None
 
 
 @dataclass(frozen=True)
 class CommonConfig:
     blocked_datasets: list[str] = field(default_factory=COMMON_BLOCKED_DATASETS.copy)
-    dataset_scripts_allow_list: list[str] = field(default_factory=COMMON_DATASET_SCRIPTS_ALLOW_LIST.copy)
     hf_endpoint: str = COMMON_HF_ENDPOINT
     hf_token: Optional[str] = COMMON_HF_TOKEN
 
@@ -163,9 +161,6 @@ def from_env(cls) -> "CommonConfig":
         with env.prefixed("COMMON_"):
             return cls(
                 blocked_datasets=env.list(name="BLOCKED_DATASETS", default=COMMON_BLOCKED_DATASETS.copy()),
-                dataset_scripts_allow_list=env.list(
-                    name="DATASET_SCRIPTS_ALLOW_LIST", default=COMMON_DATASET_SCRIPTS_ALLOW_LIST.copy()
-                ),
                 hf_endpoint=env.str(name="HF_ENDPOINT", default=COMMON_HF_ENDPOINT),
                 hf_token=env.str(name="HF_TOKEN", default=COMMON_HF_TOKEN), # nosec
             )
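After this change, `CommonConfig` only reads the three remaining `COMMON_*` variables. A small usage sketch, assuming libcommon is installed; the environment values below are examples:

```python
# Illustrative usage of the trimmed CommonConfig: dataset_scripts_allow_list is gone,
# only blocked_datasets, hf_endpoint and hf_token remain.
import os

from libcommon.config import CommonConfig

os.environ["COMMON_BLOCKED_DATASETS"] = "some_namespace/*,another_org/private-dataset"

config = CommonConfig.from_env()
print(config.blocked_datasets)  # ['some_namespace/*', 'another_org/private-dataset']
print(config.hf_endpoint)       # 'https://huggingface.co' (the default)
```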
10 changes: 8 additions & 2 deletions libs/libcommon/src/libcommon/constants.py
@@ -42,6 +42,14 @@
 DEFAULT_MAX_FAILED_RUNS = 3
 LARGE_MAX_FAILED_RUNS = 30 # for errors that should not be permanent
 MAX_FAILED_RUNS_PER_ERROR_CODE = {
+    # 20240823 - temporary until all these deprecated errors disappear
+    "UnsupportedExternalFilesError": DEFAULT_MAX_FAILED_RUNS,
+    "ExternalFilesSizeRequestError": DEFAULT_MAX_FAILED_RUNS,
+    "ExternalFilesSizeRequestHTTPError": DEFAULT_MAX_FAILED_RUNS,
+    "ExternalFilesSizeRequestConnectionError": DEFAULT_MAX_FAILED_RUNS,
+    "ExternalFilesSizeRequestTimeoutError": DEFAULT_MAX_FAILED_RUNS,
+    "DatasetModuleNotInstalledError": DEFAULT_MAX_FAILED_RUNS,
+    "DatasetScriptError": DEFAULT_MAX_FAILED_RUNS,
     # default
     "RetryableConfigNamesError": DEFAULT_MAX_FAILED_RUNS,
     "ConnectionError": DEFAULT_MAX_FAILED_RUNS,
@@ -56,8 +64,6 @@
 }
 ERROR_CODES_TO_RETRY = list(MAX_FAILED_RUNS_PER_ERROR_CODE.keys())
 
-EXTERNAL_DATASET_SCRIPT_PATTERN = "datasets_modules/datasets"
-
 # Arrays are not immutable, we have to take care of not modifying them
 # Anyway: in all this file, we allow constant reassignment (no use of Final)
 CONFIG_HAS_VIEWER_KIND = "config-size"
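The deprecated error codes are kept in the retry map so that existing cache entries carrying those errors keep getting retried until they disappear. A minimal sketch of how such a budget map can drive a retry decision (`should_retry` is an illustrative helper, not the actual libcommon scheduler code):

```python
# Sketch of consulting a retry budget map like MAX_FAILED_RUNS_PER_ERROR_CODE.
DEFAULT_MAX_FAILED_RUNS = 3

MAX_FAILED_RUNS_PER_ERROR_CODE = {
    # deprecated codes kept temporarily so existing cache entries get retried away
    "UnsupportedExternalFilesError": DEFAULT_MAX_FAILED_RUNS,
    "DatasetScriptError": DEFAULT_MAX_FAILED_RUNS,
    # regular retryable codes
    "RetryableConfigNamesError": DEFAULT_MAX_FAILED_RUNS,
}


def should_retry(error_code: str, failed_runs: int) -> bool:
    """Retry only while the error code is retryable and under its failure budget."""
    max_runs = MAX_FAILED_RUNS_PER_ERROR_CODE.get(error_code)
    return max_runs is not None and failed_runs < max_runs


print(should_retry("DatasetScriptError", 1))    # True: still under the budget of 3
print(should_retry("DatasetScriptError", 3))    # False: budget exhausted
print(should_retry("DatasetNotFoundError", 0))  # False: not a retryable code
```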
56 changes: 0 additions & 56 deletions libs/libcommon/src/libcommon/exceptions.py
@@ -82,9 +82,7 @@ def as_response(self) -> ErrorResponse:
     "DatasetGenerationError",
     "DatasetGenerationCastError",
     "DatasetInBlockListError",
-    "DatasetModuleNotInstalledError",
     "DatasetNotFoundError",
-    "DatasetScriptError",
     "DatasetWithScriptNotSupportedError",
     "DatasetWithTooComplexDataFilesPatternsError",
     "DatasetWithTooManyConfigsError",
@@ -93,10 +91,6 @@ def as_response(self) -> ErrorResponse:
     "DiskError",
     "DuckDBIndexFileNotFoundError",
     "EmptyDatasetError",
-    "ExternalFilesSizeRequestConnectionError",
-    "ExternalFilesSizeRequestError",
-    "ExternalFilesSizeRequestHTTPError",
-    "ExternalFilesSizeRequestTimeoutError",
     "ExternalServerError",
     "FeaturesError",
     "FeaturesResponseEmptyError",
@@ -134,7 +128,6 @@ def as_response(self) -> ErrorResponse:
     "TooLongColumnNameError",
     "TooManyColumnsError",
     "UnexpectedError",
-    "UnsupportedExternalFilesError",
 ]
 
 
@@ -209,13 +202,6 @@ def __init__(self, message: str, cause: Optional[BaseException] = None):
         super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "DatasetGenerationCastError", cause, True)
 
 
-class DatasetModuleNotInstalledError(CacheableError):
-    """The dataset tries to import a module that is not installed."""
-
-    def __init__(self, message: str, cause: Optional[BaseException] = None):
-        super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "DatasetModuleNotInstalledError", cause, True)
-
-
 class DatasetNotFoundError(CacheableError):
     """The dataset does not exist."""
 
@@ -229,13 +215,6 @@ def __init__(self, message: str, cause: Optional[BaseException] = None):
         )
 
 
-class DatasetScriptError(CacheableError):
-    """The dataset script generated an error."""
-
-    def __init__(self, message: str, cause: Optional[BaseException] = None):
-        super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "DatasetScriptError", cause, False)
-
-
 class DatasetWithTooManyConfigsError(CacheableError):
     """The number of configs of a dataset exceeded the limit."""
 
@@ -284,34 +263,6 @@ def __init__(self, message: str, cause: Optional[BaseException] = None):
         super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "EmptyDatasetError", cause, True)
 
 
-class ExternalFilesSizeRequestConnectionError(CacheableError):
-    """We failed to get the size of the external files."""
-
-    def __init__(self, message: str, cause: Optional[BaseException] = None):
-        super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "ExternalFilesSizeRequestConnectionError", cause, False)
-
-
-class ExternalFilesSizeRequestError(CacheableError):
-    """We failed to get the size of the external files."""
-
-    def __init__(self, message: str, cause: Optional[BaseException] = None):
-        super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "ExternalFilesSizeRequestError", cause, False)
-
-
-class ExternalFilesSizeRequestHTTPError(CacheableError):
-    """We failed to get the size of the external files."""
-
-    def __init__(self, message: str, cause: Optional[BaseException] = None):
-        super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "ExternalFilesSizeRequestHTTPError", cause, False)
-
-
-class ExternalFilesSizeRequestTimeoutError(CacheableError):
-    """We failed to get the size of the external files."""
-
-    def __init__(self, message: str, cause: Optional[BaseException] = None):
-        super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "ExternalFilesSizeRequestTimeoutError", cause, False)
-
-
 class ExternalServerError(CacheableError):
     """The spawning.ai server is not responding."""
 
@@ -574,13 +525,6 @@ def __init__(self, message: str, cause: Optional[BaseException] = None):
         logging.error(message, exc_info=cause)
 
 
-class UnsupportedExternalFilesError(CacheableError):
-    """We failed to get the size of the external files."""
-
-    def __init__(self, message: str, cause: Optional[BaseException] = None):
-        super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "UnsupportedExternalFilesError", cause, False)
-
-
 class DatasetWithScriptNotSupportedError(CacheableError):
     """We don't support some datasets because they have a dataset script."""
 
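With the script-specific error classes removed, datasets that still ship a loading script are reported through the remaining `DatasetWithScriptNotSupportedError`. A hypothetical check illustrating that behavior, assuming libcommon is importable (`ensure_no_script` is an assumption, not the worker's actual code path):

```python
# Hypothetical guard: reject repositories that contain a loading script.
from libcommon.exceptions import DatasetWithScriptNotSupportedError


def ensure_no_script(repo_filenames: list[str]) -> None:
    if any(name.endswith(".py") for name in repo_filenames):
        raise DatasetWithScriptNotSupportedError("Datasets with a loading script are no longer supported.")


ensure_no_script(["data/train-00000-of-00001.parquet", "README.md"])  # passes silently
```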
1 change: 0 additions & 1 deletion services/worker/README.md
@@ -76,7 +76,6 @@ Set environment variables to configure the `parquet-and-info` worker (`PARQUET_A
 - `PARQUET_AND_INFO_COMMIT_MESSAGE`: the git commit message when the worker uploads the parquet files to the Hub. Defaults to `Update parquet files`.
 - `PARQUET_AND_INFO_COMMITTER_HF_TOKEN`: the HuggingFace token to commit the parquet files to the Hub. The token must be an app token associated with a user that has the right to 1. create the `refs/convert/parquet` branch (see `PARQUET_AND_INFO_TARGET_REVISION`) and 2. push commits to it on any dataset. [Datasets maintainers](https://huggingface.co/datasets-maintainers) members have these rights. The token must have permission to write. If not set, the worker will fail. Defaults to None.
 - `PARQUET_AND_INFO_MAX_DATASET_SIZE_BYTES`: the maximum size in bytes of the dataset to pre-compute the parquet files. Bigger datasets, or datasets without that information, are partially streamed to get parquet files up to this value. Defaults to `100_000_000`.
-- `PARQUET_AND_INFO_MAX_EXTERNAL_DATA_FILES`: the maximum number of external files of the datasets. Bigger datasets, or datasets without that information, are partially streamed to get parquet files up to `PARQUET_AND_INFO_MAX_DATASET_SIZE_BYTES` bytes. Defaults to `10_000`.
 - `PARQUET_AND_INFO_MAX_ROW_GROUP_BYTE_SIZE_FOR_COPY`: the maximum size in bytes of the row groups of parquet datasets that are copied to the target revision. Bigger datasets, or datasets without that information, are partially streamed to get parquet files up to `PARQUET_AND_INFO_MAX_DATASET_SIZE_BYTES` bytes. Defaults to `100_000_000`.
 - `PARQUET_AND_INFO_SOURCE_REVISION`: the git revision of the dataset to use to prepare the parquet files. Defaults to `main`.
 - `PARQUET_AND_INFO_TARGET_REVISION`: the git revision of the dataset where to store the parquet files. Make sure the committer token (`PARQUET_AND_INFO_COMMITTER_HF_TOKEN`) has the permission to write there. Defaults to `refs/convert/parquet`.
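The size gate described in the README keeps working the same way after the removal: datasets larger than `PARQUET_AND_INFO_MAX_DATASET_SIZE_BYTES`, or with an unknown size, are only partially streamed to parquet. A rough sketch of that decision (`conversion_mode` is an assumed helper, not the worker's exact implementation):

```python
# Illustrative size gate for parquet conversion.
from typing import Optional

MAX_DATASET_SIZE_BYTES = 100_000_000  # the documented default


def conversion_mode(dataset_size_bytes: Optional[int]) -> str:
    """Full conversion when the size is known and under the limit, partial streaming otherwise."""
    if dataset_size_bytes is not None and dataset_size_bytes <= MAX_DATASET_SIZE_BYTES:
        return "full conversion"
    return "partial conversion (stream up to MAX_DATASET_SIZE_BYTES bytes)"


print(conversion_mode(5_000_000))          # full conversion
print(conversion_mode(None))               # partial conversion (size unknown)
print(conversion_mode(1_000_000_000_000))  # partial conversion (too big)
```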
5 changes: 0 additions & 5 deletions services/worker/src/worker/config.py
@@ -209,7 +209,6 @@ def from_env(cls) -> "PresidioEntitiesScanConfig":
 PARQUET_AND_INFO_COMMIT_MESSAGE = "Update parquet files"
 PARQUET_AND_INFO_COMMITTER_HF_TOKEN = None
 PARQUET_AND_INFO_MAX_DATASET_SIZE_BYTES = 100_000_000
-PARQUET_AND_INFO_MAX_EXTERNAL_DATA_FILES = 10_000
 PARQUET_AND_INFO_MAX_ROW_GROUP_BYTE_SIZE_FOR_COPY = 100_000_000
 PARQUET_AND_INFO_SOURCE_REVISION = "main"
 PARQUET_AND_INFO_TARGET_REVISION = "refs/convert/parquet"
@@ -221,7 +220,6 @@ class ParquetAndInfoConfig:
     commit_message: str = PARQUET_AND_INFO_COMMIT_MESSAGE
     committer_hf_token: Optional[str] = PARQUET_AND_INFO_COMMITTER_HF_TOKEN
     max_dataset_size_bytes: int = PARQUET_AND_INFO_MAX_DATASET_SIZE_BYTES
-    max_external_data_files: int = PARQUET_AND_INFO_MAX_EXTERNAL_DATA_FILES
     max_row_group_byte_size_for_copy: int = PARQUET_AND_INFO_MAX_ROW_GROUP_BYTE_SIZE_FOR_COPY
     source_revision: str = PARQUET_AND_INFO_SOURCE_REVISION
     target_revision: str = PARQUET_AND_INFO_TARGET_REVISION
@@ -237,9 +235,6 @@ def from_env(cls) -> "ParquetAndInfoConfig":
                 max_dataset_size_bytes=env.int(
                     name="MAX_DATASET_SIZE_BYTES", default=PARQUET_AND_INFO_MAX_DATASET_SIZE_BYTES
                 ),
-                max_external_data_files=env.int(
-                    name="MAX_EXTERNAL_DATA_FILES", default=PARQUET_AND_INFO_MAX_EXTERNAL_DATA_FILES
-                ),
                 max_row_group_byte_size_for_copy=env.int(
                     name="MAX_ROW_GROUP_BYTE_SIZE_FOR_COPY", default=PARQUET_AND_INFO_MAX_ROW_GROUP_BYTE_SIZE_FOR_COPY
                 ),
10 changes: 1 addition & 9 deletions services/worker/src/worker/job_manager.py
@@ -10,7 +10,6 @@
 from libcommon.exceptions import (
     CustomError,
     DatasetNotFoundError,
-    DatasetScriptError,
     JobManagerCrashedError,
     JobManagerExceededMaximumDurationError,
     PreviousStepStillProcessingError,
@@ -27,7 +26,6 @@
 
 from worker.config import AppConfig, WorkerConfig
 from worker.job_runner import JobRunner
-from worker.utils import is_dataset_script_error
 
 
 class JobManager:
@@ -187,13 +185,7 @@ def process(
                 "duration": get_duration_or_none(started_at),
             }
         except Exception as err:
-            e = (
-                err
-                if isinstance(err, CustomError)
-                else DatasetScriptError(str(err), err)
-                if is_dataset_script_error()
-                else UnexpectedError(str(err), err)
-            )
+            e = err if isinstance(err, CustomError) else UnexpectedError(str(err), err)
             self.debug(f"response for job_info={self.job_info} had an error")
             return {
                 "job_info": self.job_info,