Disable dataset scripts #2001

Merged: 12 commits, Oct 19, 2023
2 changes: 2 additions & 0 deletions chart/templates/_env/_envCommon.tpl
@@ -4,6 +4,8 @@
{{- define "envCommon" -}}
- name: COMMON_BLOCKED_DATASETS
value: {{ .Values.common.blockedDatasets | quote}}
- name: COMMON_DATASET_SCRIPTS_ALLOW_LIST
value: {{ .Values.common.datasetScriptsAllowList | quote}}
- name: COMMON_HF_ENDPOINT
value: {{ include "datasetsServer.hub.url" . }}
- name: HF_ENDPOINT # see https://github.com/huggingface/datasets/pull/5196#issuecomment-1322191411
6 changes: 5 additions & 1 deletion chart/values.yaml
@@ -78,9 +78,13 @@ images:
tag: sha-fb3399a

common:
# comma-separated list of blocked datasets (e.g. if not supported by the Hub). No jobs will be processed for those datasets.
# Comma-separated list of blocked datasets (e.g. if not supported by the Hub). No jobs will be processed for those datasets.
# See https://observablehq.com/@huggingface/blocked-datasets
blockedDatasets: "Graphcore/gqa,Graphcore/gqa-lxmert,Graphcore/vqa,Graphcore/vqa-lxmert,echarlaix/gqa-lxmert,echarlaix/vqa,echarlaix/vqa-lxmert,KakologArchives/KakologArchives,open-llm-leaderboard/*"
# Comma-separated list of the datasets for which we support dataset scripts.
# Unix shell-style wildcards also work in the dataset name for namespaced datasets, for example `some_namespace/*` to refer to all the datasets in the `some_namespace` namespace.
# The keyword `{{ALL_DATASETS_WITH_NO_NAMESPACE}}` refers to all the datasets without namespace.
datasetScriptsAllowList: "{{ALL_DATASETS_WITH_NO_NAMESPACE}}"
# URL of the HuggingFace Hub
hfEndpoint: ""

1 change: 1 addition & 0 deletions libs/libcommon/README.md
@@ -14,6 +14,7 @@ Set the assets (images and audio files stored locally) environment variables to
Set the common environment variables to configure the following aspects:

- `COMMON_BLOCKED_DATASETS`: comma-separated list of the blocked datasets. Unix shell-style wildcards also work in the dataset name for namespaced datasets, for example `some_namespace/*` to block all the datasets in the `some_namespace` namespace. If empty, no dataset is blocked. Defaults to empty.
- `COMMON_DATASET_SCRIPTS_ALLOW_LIST`: comma-separated list of the datasets for which we support dataset scripts. Unix shell-style wildcards also work in the dataset name for namespaced datasets, for example `some_namespace/*` to refer to all the datasets in the `some_namespace` namespace. The keyword `{{ALL_DATASETS_WITH_NO_NAMESPACE}}` refers to all the datasets without namespace. If empty, no dataset with script is supported. Defaults to empty.
- `COMMON_HF_ENDPOINT`: URL of the HuggingFace Hub. Defaults to `https://huggingface.co`.
- `COMMON_HF_TOKEN`: App Access Token (ask moonlanding administrators to get one, only the `read` role is required) to access the gated datasets. Defaults to empty.

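The wildcard and keyword semantics described above mirror the matching logic this PR adds in `services/worker/src/worker/utils.py` further below. A minimal standalone sketch (the dataset names are hypothetical):

```python
from fnmatch import fnmatch

ALL_NO_NAMESPACE = "{{ALL_DATASETS_WITH_NO_NAMESPACE}}"

def is_allowed(dataset: str, allow_list: list[str]) -> bool:
    # A dataset is allowed if any pattern matches: the special keyword
    # covers datasets without a namespace (no "/" in the name), and the
    # other entries are Unix shell-style wildcards evaluated with fnmatch.
    return any(
        (pattern == ALL_NO_NAMESPACE and "/" not in dataset) or fnmatch(dataset, pattern)
        for pattern in allow_list
    )

assert is_allowed("squad", [ALL_NO_NAMESPACE])                  # no namespace
assert is_allowed("some_namespace/foo", ["some_namespace/*"])   # wildcard match
assert not is_allowed("other_namespace/foo", [ALL_NO_NAMESPACE])
```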
5 changes: 5 additions & 0 deletions libs/libcommon/src/libcommon/config.py
@@ -162,13 +162,15 @@ def from_env(cls) -> "RowsIndexConfig":


COMMON_BLOCKED_DATASETS: list[str] = []
COMMON_DATASET_SCRIPTS_ALLOW_LIST: list[str] = []
COMMON_HF_ENDPOINT = "https://huggingface.co"
COMMON_HF_TOKEN = None


@dataclass(frozen=True)
class CommonConfig:
blocked_datasets: list[str] = field(default_factory=COMMON_BLOCKED_DATASETS.copy)
dataset_scripts_allow_list: list[str] = field(default_factory=COMMON_DATASET_SCRIPTS_ALLOW_LIST.copy)
hf_endpoint: str = COMMON_HF_ENDPOINT
hf_token: Optional[str] = COMMON_HF_TOKEN

@@ -178,6 +180,9 @@ def from_env(cls) -> "CommonConfig":
with env.prefixed("COMMON_"):
return cls(
blocked_datasets=env.list(name="BLOCKED_DATASETS", default=COMMON_BLOCKED_DATASETS.copy()),
dataset_scripts_allow_list=env.list(
name="DATASET_SCRIPTS_ALLOW_LIST", default=COMMON_DATASET_SCRIPTS_ALLOW_LIST.copy()
),
hf_endpoint=env.str(name="HF_ENDPOINT", default=COMMON_HF_ENDPOINT),
hf_token=env.str(name="HF_TOKEN", default=COMMON_HF_TOKEN), # nosec
)
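For illustration, a sketch of how the new field is read from the environment. `environs`' `env.list` splits the variable on commas; the allow-list value here is hypothetical:

```python
import os

from libcommon.config import CommonConfig

os.environ["COMMON_DATASET_SCRIPTS_ALLOW_LIST"] = "{{ALL_DATASETS_WITH_NO_NAMESPACE}},some_namespace/*"

config = CommonConfig.from_env()
print(config.dataset_scripts_allow_list)
# ['{{ALL_DATASETS_WITH_NO_NAMESPACE}}', 'some_namespace/*']
```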
8 changes: 8 additions & 0 deletions libs/libcommon/src/libcommon/exceptions.py
@@ -86,6 +86,7 @@ def as_response(self) -> ErrorResponse:
"DatasetRevisionEmptyError",
"DatasetRevisionNotFoundError",
"DatasetScriptError",
"DatasetWithScriptNotSupportedError",
"DatasetWithTooManyConfigsError",
"DatasetWithTooManyParquetFilesError",
"DisabledViewerError",
@@ -547,3 +548,10 @@ class UnsupportedExternalFilesError(CacheableError):

def __init__(self, message: str, cause: Optional[BaseException] = None):
super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "UnsupportedExternalFilesError", cause, True)


class DatasetWithScriptNotSupportedError(CacheableError):
"""We don't support some datasets because they have a dataset script."""

def __init__(self, message: str, cause: Optional[BaseException] = None):
super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "DatasetWithScriptNotSupportedError", cause, True)
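A sketch of how the new error is expected to surface, assuming `CacheableError` stores the HTTP status and error code under `status_code` and `code`, as the `as_response` machinery for the other exceptions in this module suggests:

```python
from http import HTTPStatus

from libcommon.exceptions import DatasetWithScriptNotSupportedError

try:
    raise DatasetWithScriptNotSupportedError("this dataset has a script and is not in the allow list")
except DatasetWithScriptNotSupportedError as err:
    # Cached as a 501 response; clients see the error code, not a stack trace.
    assert err.status_code == HTTPStatus.NOT_IMPLEMENTED  # attribute name assumed
    assert err.code == "DatasetWithScriptNotSupportedError"  # attribute name assumed
```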
25 changes: 18 additions & 7 deletions services/worker/src/worker/job_runners/config/parquet_and_info.py
@@ -80,6 +80,7 @@
HF_HUB_HTTP_ERROR_RETRY_SLEEPS,
LOCK_GIT_BRANCH_RETRY_SLEEPS,
create_branch,
disable_dataset_scripts_support,
hf_hub_url,
retry,
)
@@ -1088,6 +1089,7 @@ def compute_config_parquet_and_info_response(
max_external_data_files: int,
max_row_group_byte_size_for_copy: int,
no_max_size_limit_datasets: list[str],
dataset_scripts_allow_list: list[str],
) -> ConfigParquetAndInfoResponse:
"""
Get the response of config-parquet-and-info for one specific dataset and config on huggingface.co.
@@ -1124,6 +1126,11 @@ def compute_config_parquet_and_info_response(
The maximum size in bytes of parquet files that are allowed to be copied without being converted.
no_max_size_limit_datasets (`list[str]`):
List of datasets that should be fully converted (no partial conversion).
dataset_scripts_allow_list (`list[str]`):
List of datasets for which we support dataset scripts.
Unix shell-style wildcards also work in the dataset name for namespaced datasets,
for example `some_namespace/*` to refer to all the datasets in the `some_namespace` namespace.
The keyword `{{ALL_DATASETS_WITH_NO_NAMESPACE}}` refers to all the datasets without namespace.
Returns:
`ConfigParquetAndInfoResponse`: An object with the config_parquet_and_info_response
(dataset info and list of parquet files).
@@ -1164,6 +1171,8 @@ def compute_config_parquet_and_info_response(
If we failed to get the external files sizes to make sure we can convert the dataset to parquet
- [`libcommon.exceptions.ExternalFilesSizeRequestError`]
If we failed to get the external files sizes to make sure we can convert the dataset to parquet
- [`libcommon.exceptions.DatasetWithScriptNotSupportedError`]
If the dataset has a dataset script and is not in the allow list.
- [`libcommon.exceptions.PreviousStepFormatError`]
If the content of the previous step does not have the expected format
- [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
@@ -1194,13 +1203,14 @@

download_config = DownloadConfig(delete_extracted=True)
try:
builder = load_dataset_builder(
path=dataset,
name=config,
revision=source_revision,
token=hf_token,
download_config=download_config,
)
with disable_dataset_scripts_support(allow_list=dataset_scripts_allow_list):
builder = load_dataset_builder(
path=dataset,
name=config,
revision=source_revision,
token=hf_token,
download_config=download_config,
)
except _EmptyDatasetError as err:
raise EmptyDatasetError(f"{dataset=} is empty.", cause=err) from err
except FileNotFoundError as err:
@@ -1350,5 +1360,6 @@ def compute(self) -> CompleteJobResult:
max_external_data_files=self.parquet_and_info_config.max_external_data_files,
max_row_group_byte_size_for_copy=self.parquet_and_info_config.max_row_group_byte_size_for_copy,
no_max_size_limit_datasets=self.parquet_and_info_config.no_max_size_limit_datasets,
dataset_scripts_allow_list=self.app_config.common.dataset_scripts_allow_list,
)
)
(changes to another file; filename not shown in this diff view)
@@ -34,7 +34,12 @@
from worker.config import AppConfig, FirstRowsConfig
from worker.dtos import CompleteJobResult, JobRunnerInfo, SplitFirstRowsResponse
from worker.job_runners.split.split_job_runner import SplitJobRunnerWithDatasetsCache
from worker.utils import create_truncated_row_items, get_json_size, get_rows_or_raise
from worker.utils import (
create_truncated_row_items,
disable_dataset_scripts_support,
get_json_size,
get_rows_or_raise,
)


def transform_rows(
@@ -74,6 +79,7 @@ def compute_first_rows_response(
rows_max_number: int,
rows_min_number: int,
columns_max_number: int,
dataset_scripts_allow_list: list[str],
max_size_fallback: Optional[int] = None,
) -> SplitFirstRowsResponse:
"""
@@ -107,6 +113,11 @@ def compute_first_rows_response(
The minimum number of rows of the response.
columns_max_number (`int`):
The maximum number of columns supported.
dataset_scripts_allow_list (`list[str]`):
List of datasets for which we support dataset scripts.
Unix shell-style wildcards also work in the dataset name for namespaced datasets,
for example `some_namespace/*` to refer to all the datasets in the `some_namespace` namespace.
The keyword `{{ALL_DATASETS_WITH_NO_NAMESPACE}}` refers to all the datasets without namespace.
Returns:
[`SplitFirstRowsResponse`]: The list of first rows of the split.
Raises the following errors:
@@ -130,6 +141,8 @@
If the split rows could not be obtained using the datasets library in streaming mode.
- [`libcommon.exceptions.NormalRowsError`]
If the split rows could not be obtained using the datasets library in normal mode.
- [`libcommon.exceptions.DatasetWithScriptNotSupportedError`]
If the dataset has a dataset script and is not in the allow list.
"""
logging.info(f"get first-rows for dataset={dataset} config={config} split={split}")
# get the features
@@ -147,13 +160,14 @@
if not info.features:
try:
# https://github.com/huggingface/datasets/blob/f5826eff9b06ab10dba1adfa52543341ef1e6009/src/datasets/iterable_dataset.py#L1255
iterable_dataset = load_dataset(
path=dataset,
name=config,
split=split,
streaming=True,
token=hf_token,
)
with disable_dataset_scripts_support(dataset_scripts_allow_list):
iterable_dataset = load_dataset(
path=dataset,
name=config,
split=split,
streaming=True,
token=hf_token,
)
if not isinstance(iterable_dataset, IterableDataset):
raise TypeError("load_dataset should return an IterableDataset.")
iterable_dataset = iterable_dataset._resolve_features()
@@ -197,15 +211,16 @@
)

# get the rows
rows_content = get_rows_or_raise(
dataset=dataset,
config=config,
split=split,
info=info,
max_size_fallback=max_size_fallback,
rows_max_number=rows_max_number,
token=hf_token,
)
with disable_dataset_scripts_support(dataset_scripts_allow_list):
rows_content = get_rows_or_raise(
dataset=dataset,
config=config,
split=split,
info=info,
max_size_fallback=max_size_fallback,
rows_max_number=rows_max_number,
token=hf_token,
)
rows = rows_content["rows"]
all_fetched = rows_content["all_fetched"]

@@ -301,5 +316,6 @@ def compute(self) -> CompleteJobResult:
rows_max_number=self.first_rows_config.max_number,
rows_min_number=self.first_rows_config.min_number,
columns_max_number=self.first_rows_config.columns_max_number,
dataset_scripts_allow_list=self.app_config.common.dataset_scripts_allow_list,
)
)
(changes to another file; filename not shown in this diff view)
@@ -24,7 +24,7 @@
from worker.config import AppConfig, OptInOutUrlsScanConfig
from worker.dtos import CompleteJobResult, OptInOutUrlsScanResponse, OptUrl
from worker.job_runners.split.split_job_runner import SplitJobRunnerWithDatasetsCache
from worker.utils import get_rows_or_raise
from worker.utils import disable_dataset_scripts_support, get_rows_or_raise


async def check_spawning(
@@ -102,6 +102,7 @@ def compute_opt_in_out_urls_scan_response(
max_concurrent_requests_number: int,
max_requests_per_second: int,
spawning_url: str,
dataset_scripts_allow_list: list[str],
) -> OptInOutUrlsScanResponse:
"""
Get the response of split-opt-in-out-urls-scan cache for a specific split of a dataset from huggingface.co.
@@ -132,6 +133,11 @@ def compute_opt_in_out_urls_scan_response(
The maximum number of requests to be processed by second.
spawning_url (`str`):
Spawning API URL
dataset_scripts_allow_list (`list[str]`):
List of datasets for which we support dataset scripts.
Unix shell-style wildcards also work in the dataset name for namespaced datasets,
for example `some_namespace/*` to refer to all the datasets in the `some_namespace` namespace.
The keyword `{{ALL_DATASETS_WITH_NO_NAMESPACE}}` refers to all the datasets without namespace.

Returns:
[`OptInOutUrlsScanResponse`]
Expand All @@ -148,6 +154,8 @@ def compute_opt_in_out_urls_scan_response(
If the split rows could not be obtained using the datasets library in streaming mode.
- [`libcommon.exceptions.NormalRowsError`]
If the split rows could not be obtained using the datasets library in normal mode.
- [`libcommon.exceptions.DatasetWithScriptNotSupportedError`]
If the dataset has a dataset script and is not in the allow list.
"""
logging.info(f"get opt-in-out-urls-scan for dataset={dataset} config={config} split={split}")

@@ -200,15 +208,16 @@
)

# get the rows
rows_content = get_rows_or_raise(
dataset=dataset,
config=config,
split=split,
info=info,
rows_max_number=rows_max_number,
token=hf_token,
column_names=image_url_columns,
)
with disable_dataset_scripts_support(dataset_scripts_allow_list):
rows_content = get_rows_or_raise(
dataset=dataset,
config=config,
split=split,
info=info,
rows_max_number=rows_max_number,
token=hf_token,
column_names=image_url_columns,
)
rows = rows_content["rows"]

# get the urls
@@ -300,5 +309,6 @@ def compute(self) -> CompleteJobResult:
max_concurrent_requests_number=self.urls_scan_config.max_concurrent_requests_number,
max_requests_per_second=self.urls_scan_config.max_requests_per_second,
spawning_url=self.urls_scan_config.spawning_url,
dataset_scripts_allow_list=self.app_config.common.dataset_scripts_allow_list,
)
)
40 changes: 40 additions & 0 deletions services/worker/src/worker/utils.py
@@ -9,12 +9,17 @@
import traceback
import warnings
from collections.abc import Callable, Sequence
from contextlib import AbstractContextManager
from fnmatch import fnmatch
from typing import Any, Optional, TypeVar, Union, cast
from unittest.mock import patch
from urllib.parse import quote

import datasets
import PIL
import requests
from datasets import Dataset, DatasetInfo, DownloadConfig, IterableDataset, load_dataset
from datasets.load import HubDatasetModuleFactoryWithScript
from datasets.utils.file_utils import get_authentication_headers_for_url
from fsspec.implementations.http import HTTPFileSystem
from huggingface_hub.hf_api import HfApi
@@ -23,6 +28,7 @@
from libcommon.exceptions import (
ConfigNotFoundError,
DatasetNotFoundError,
DatasetWithScriptNotSupportedError,
NormalRowsError,
PreviousStepFormatError,
SplitNotFoundError,
@@ -372,3 +378,37 @@ def is_dataset_script_error() -> bool:
(t, v, tb) = sys.exc_info()
cause_traceback: list[str] = traceback.format_exception(t, v, tb)
return any(EXTERNAL_DATASET_SCRIPT_PATTERN in cause for cause in cause_traceback)


def disable_dataset_scripts_support(allow_list: list[str]) -> AbstractContextManager[Any]:
original_init = HubDatasetModuleFactoryWithScript.__init__

def raise_unsupported_dataset_with_script_or_init(
self: HubDatasetModuleFactoryWithScript,
name: str,
revision: Optional[Union[str, datasets.Version]] = None,
download_config: Optional[DownloadConfig] = None,
download_mode: Optional[Union[datasets.DownloadMode, str]] = None,
dynamic_modules_path: Optional[str] = None,
) -> None:
for allowed_pattern in allow_list:
if (allowed_pattern == "{{ALL_DATASETS_WITH_NO_NAMESPACE}}" and "/" not in name) or fnmatch(
name, allowed_pattern
):
break
else:
raise DatasetWithScriptNotSupportedError(
"The dataset viewer doesn't support this dataset because it runs "
"arbitrary python code. Please open a discussion in the discussion tab "
"if you think this is an error and tag @lhoestq and @severo."
)
original_init(
self=self,
name=name,
revision=revision,
download_config=download_config,
download_mode=download_mode,
dynamic_modules_path=dynamic_modules_path,
)

return patch.object(HubDatasetModuleFactoryWithScript, "__init__", raise_unsupported_dataset_with_script_or_init)
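Intended usage, matching how the job runners above wrap `datasets` calls (a sketch; the dataset name is hypothetical):

```python
from datasets import load_dataset_builder
from libcommon.exceptions import DatasetWithScriptNotSupportedError

from worker.utils import disable_dataset_scripts_support

allow_list = ["{{ALL_DATASETS_WITH_NO_NAMESPACE}}", "some_namespace/*"]

try:
    # Inside the block, HubDatasetModuleFactoryWithScript.__init__ is patched:
    # any script-based dataset whose name matches no allow-list pattern raises.
    with disable_dataset_scripts_support(allow_list=allow_list):
        builder = load_dataset_builder("other_namespace/dataset_with_script")  # hypothetical
except DatasetWithScriptNotSupportedError as err:
    print(f"blocked: {err}")
```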
13 changes: 13 additions & 0 deletions services/worker/tests/conftest.py
@@ -62,6 +62,19 @@ def monkeypatch_session() -> Iterator[MonkeyPatch]:
mp.undo()


@fixture()
def use_hub_prod_endpoint() -> Iterator[MonkeyPatch]:
mp = MonkeyPatch()
mp.setattr(
"huggingface_hub.file_download.HUGGINGFACE_CO_URL_TEMPLATE",
"https://huggingface.co/{repo_id}/resolve/{revision}/{filename}",
)
# ^ see https://github.com/huggingface/datasets/pull/5196#issuecomment-1322191056
mp.setattr("datasets.config.HF_ENDPOINT", "https://huggingface.co")
yield mp
mp.undo()
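
A hypothetical test (not part of this PR) showing the fixture in use: while it is active, the `datasets` library resolves files against the production Hub:

```python
from pytest import MonkeyPatch


def test_prod_hub_endpoint_is_set(use_hub_prod_endpoint: MonkeyPatch) -> None:
    import datasets.config

    assert datasets.config.HF_ENDPOINT == "https://huggingface.co"
```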


# see https://github.com/pytest-dev/pytest/issues/363#issuecomment-406536200
@fixture
def set_env_vars(