Disable dataset scripts #2001

Merged: 12 commits, Oct 19, 2023
2 changes: 2 additions & 0 deletions chart/templates/_env/_envCommon.tpl
@@ -4,6 +4,8 @@
{{- define "envCommon" -}}
- name: COMMON_BLOCKED_DATASETS
value: {{ .Values.common.blockedDatasets | quote}}
- name: COMMON_DATASET_SCRIPTS_ALLOW_LIST
value: {{ .Values.common.datasetScriptsAllowList | quote}}
- name: COMMON_HF_ENDPOINT
value: {{ include "datasetsServer.hub.url" . }}
- name: HF_ENDPOINT # see https://github.com/huggingface/datasets/pull/5196#issuecomment-1322191411
6 changes: 5 additions & 1 deletion chart/values.yaml
@@ -78,9 +78,13 @@ images:
tag: sha-fb3399a

common:
# comma-separated list of blocked datasets (e.g. if not supported by the Hub). No jobs will be processed for those datasets.
# Comma-separated list of blocked datasets (e.g. if not supported by the Hub). No jobs will be processed for those datasets.
# See https://observablehq.com/@huggingface/blocked-datasets
blockedDatasets: "Graphcore/gqa,Graphcore/gqa-lxmert,Graphcore/vqa,Graphcore/vqa-lxmert,echarlaix/gqa-lxmert,echarlaix/vqa,echarlaix/vqa-lxmert,KakologArchives/KakologArchives,open-llm-leaderboard/*"
# Comma-separated list of the datasets for which we support dataset scripts.
# Unix shell-style wildcards also work in the dataset name for namespaced datasets, for example `some_namespace/*` to refer to all the datasets in the `some_namespace` namespace.
# The keyword `{{ALL_DATASETS_WITH_NO_NAMESPACE}}` refers to all the datasets without namespace.
datasetScriptsAllowList: "{{ALL_DATASETS_WITH_NO_NAMESPACE}}"
# URL of the HuggingFace Hub
hfEndpoint: ""

1 change: 1 addition & 0 deletions libs/libcommon/README.md
@@ -14,6 +14,7 @@ Set the assets (images and audio files stored locally) environment variables to
Set the common environment variables to configure the following aspects:

- `COMMON_BLOCKED_DATASETS`: comma-separated list of the blocked datasets. Unix shell-style wildcards also work in the dataset name for namespaced datasets, for example `some_namespace/*` to block all the datasets in the `some_namespace` namespace. If empty, no dataset is blocked. Defaults to empty.
- `COMMON_DATASET_SCRIPTS_ALLOW_LIST`: comma-separated list of the datasets for which we support dataset scripts. Unix shell-style wildcards also work in the dataset name for namespaced datasets, for example `some_namespace/*` to refer to all the datasets in the `some_namespace` namespace. The keyword `{{ALL_DATASETS_WITH_NO_NAMESPACE}}` refers to all the datasets without namespace. If empty, no dataset with script is supported. Defaults to empty.
- `COMMON_HF_ENDPOINT`: URL of the HuggingFace Hub. Defaults to `https://huggingface.co`.
- `COMMON_HF_TOKEN`: App Access Token (ask moonlanding administrators to get one, only the `read` role is required) to access the gated datasets. Defaults to empty.

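The wildcard and keyword semantics described above mirror the matching logic this PR adds in `services/worker/src/worker/utils.py` further below. A minimal standalone sketch (the dataset names are hypothetical):

```python
from fnmatch import fnmatch

ALL_NO_NAMESPACE = "{{ALL_DATASETS_WITH_NO_NAMESPACE}}"

def is_allowed(dataset: str, allow_list: list[str]) -> bool:
    # A dataset is allowed if any pattern matches: the special keyword
    # covers datasets without a namespace (no "/" in the name), and the
    # other entries are Unix shell-style wildcards evaluated with fnmatch.
    return any(
        (pattern == ALL_NO_NAMESPACE and "/" not in dataset) or fnmatch(dataset, pattern)
        for pattern in allow_list
    )

assert is_allowed("squad", [ALL_NO_NAMESPACE])                  # no namespace
assert is_allowed("some_namespace/foo", ["some_namespace/*"])   # wildcard match
assert not is_allowed("other_namespace/foo", [ALL_NO_NAMESPACE])
```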
5 changes: 5 additions & 0 deletions libs/libcommon/src/libcommon/config.py
@@ -162,13 +162,15 @@ def from_env(cls) -> "RowsIndexConfig":


COMMON_BLOCKED_DATASETS: list[str] = []
COMMON_DATASET_SCRIPTS_ALLOW_LIST: list[str] = []
COMMON_HF_ENDPOINT = "https://huggingface.co"
COMMON_HF_TOKEN = None


@dataclass(frozen=True)
class CommonConfig:
blocked_datasets: list[str] = field(default_factory=COMMON_BLOCKED_DATASETS.copy)
dataset_scripts_allow_list: list[str] = field(default_factory=COMMON_DATASET_SCRIPTS_ALLOW_LIST.copy)
hf_endpoint: str = COMMON_HF_ENDPOINT
hf_token: Optional[str] = COMMON_HF_TOKEN

@@ -178,6 +180,9 @@ def from_env(cls) -> "CommonConfig":
with env.prefixed("COMMON_"):
return cls(
blocked_datasets=env.list(name="BLOCKED_DATASETS", default=COMMON_BLOCKED_DATASETS.copy()),
dataset_scripts_allow_list=env.list(
name="DATASET_SCRIPTS_ALLOW_LIST", default=COMMON_DATASET_SCRIPTS_ALLOW_LIST.copy()
),
hf_endpoint=env.str(name="HF_ENDPOINT", default=COMMON_HF_ENDPOINT),
hf_token=env.str(name="HF_TOKEN", default=COMMON_HF_TOKEN), # nosec
)
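For illustration, a sketch of how the new field is read from the environment. `environs`' `env.list` splits the variable on commas; the allow-list value here is hypothetical:

```python
import os

from libcommon.config import CommonConfig

os.environ["COMMON_DATASET_SCRIPTS_ALLOW_LIST"] = "{{ALL_DATASETS_WITH_NO_NAMESPACE}},some_namespace/*"

config = CommonConfig.from_env()
print(config.dataset_scripts_allow_list)
# ['{{ALL_DATASETS_WITH_NO_NAMESPACE}}', 'some_namespace/*']
```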
8 changes: 8 additions & 0 deletions libs/libcommon/src/libcommon/exceptions.py
@@ -86,6 +86,7 @@ def as_response(self) -> ErrorResponse:
"DatasetRevisionEmptyError",
"DatasetRevisionNotFoundError",
"DatasetScriptError",
"DatasetWithScriptNotSupportedError",
"DatasetWithTooManyConfigsError",
"DatasetWithTooManyParquetFilesError",
"DisabledViewerError",
@@ -547,3 +548,10 @@ class UnsupportedExternalFilesError(CacheableError):

def __init__(self, message: str, cause: Optional[BaseException] = None):
super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "UnsupportedExternalFilesError", cause, True)


class DatasetWithScriptNotSupportedError(CacheableError):
"""We don't support some datasets because they have a dataset script."""

def __init__(self, message: str, cause: Optional[BaseException] = None):
super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "DatasetWithScriptNotSupportedError", cause, True)
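A sketch of how the new error is expected to surface, assuming `CacheableError` stores the HTTP status and error code under `status_code` and `code`, as the `as_response` machinery for the other exceptions in this module suggests:

```python
from http import HTTPStatus

from libcommon.exceptions import DatasetWithScriptNotSupportedError

try:
    raise DatasetWithScriptNotSupportedError("this dataset has a script and is not in the allow list")
except DatasetWithScriptNotSupportedError as err:
    # Cached as a 501 response; clients see the error code, not a stack trace.
    assert err.status_code == HTTPStatus.NOT_IMPLEMENTED  # attribute name assumed
    assert err.code == "DatasetWithScriptNotSupportedError"  # attribute name assumed
```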
25 changes: 18 additions & 7 deletions services/worker/src/worker/job_runners/config/parquet_and_info.py
@@ -80,6 +80,7 @@
HF_HUB_HTTP_ERROR_RETRY_SLEEPS,
LOCK_GIT_BRANCH_RETRY_SLEEPS,
create_branch,
disable_dataset_scripts_support,
hf_hub_url,
retry,
)
@@ -1088,6 +1089,7 @@ def compute_config_parquet_and_info_response(
max_external_data_files: int,
max_row_group_byte_size_for_copy: int,
no_max_size_limit_datasets: list[str],
dataset_scripts_allow_list: list[str],
) -> ConfigParquetAndInfoResponse:
"""
Get the response of config-parquet-and-info for one specific dataset and config on huggingface.co.
@@ -1124,6 +1126,11 @@ def compute_config_parquet_and_info_response(
The maximum size in bytes of parquet files that are allowed to be copied without being converted.
no_max_size_limit_datasets (`list[str]`):
List of datasets that should be fully converted (no partial conversion).
dataset_scripts_allow_list (`list[str]`):
List of datasets for which we support dataset scripts.
Unix shell-style wildcards also work in the dataset name for namespaced datasets,
for example `some_namespace/*` to refer to all the datasets in the `some_namespace` namespace.
The keyword `{{ALL_DATASETS_WITH_NO_NAMESPACE}}` refers to all the datasets without namespace.
Returns:
`ConfigParquetAndInfoResponse`: An object with the config_parquet_and_info_response
(dataset info and list of parquet files).
@@ -1164,6 +1171,8 @@ def compute_config_parquet_and_info_response(
If we failed to get the external files sizes to make sure we can convert the dataset to parquet
- [`libcommon.exceptions.ExternalFilesSizeRequestError`]
If we failed to get the external files sizes to make sure we can convert the dataset to parquet
- [`libcommon.exceptions.DatasetWithScriptNotSupportedError`]
If the dataset has a dataset script and is not in the allow list.
- [`libcommon.exceptions.PreviousStepFormatError`]
If the content of the previous step does not have the expected format
- [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
@@ -1194,13 +1203,14 @@

download_config = DownloadConfig(delete_extracted=True)
try:
builder = load_dataset_builder(
path=dataset,
name=config,
revision=source_revision,
token=hf_token,
download_config=download_config,
)
with disable_dataset_scripts_support(allow_list=dataset_scripts_allow_list):
builder = load_dataset_builder(
path=dataset,
name=config,
revision=source_revision,
token=hf_token,
download_config=download_config,
)
except _EmptyDatasetError as err:
raise EmptyDatasetError(f"{dataset=} is empty.", cause=err) from err
except FileNotFoundError as err:
@@ -1350,5 +1360,6 @@ def compute(self) -> CompleteJobResult:
max_external_data_files=self.parquet_and_info_config.max_external_data_files,
max_row_group_byte_size_for_copy=self.parquet_and_info_config.max_row_group_byte_size_for_copy,
no_max_size_limit_datasets=self.parquet_and_info_config.no_max_size_limit_datasets,
dataset_scripts_allow_list=self.app_config.common.dataset_scripts_allow_list,
)
)
(changes to another file; filename not shown in this diff view)
@@ -34,7 +34,12 @@
from worker.config import AppConfig, FirstRowsConfig
from worker.dtos import CompleteJobResult, JobRunnerInfo, SplitFirstRowsResponse
from worker.job_runners.split.split_job_runner import SplitJobRunnerWithDatasetsCache
from worker.utils import create_truncated_row_items, get_json_size, get_rows_or_raise
from worker.utils import (
create_truncated_row_items,
disable_dataset_scripts_support,
get_json_size,
get_rows_or_raise,
)


def transform_rows(
@@ -74,6 +79,7 @@ def compute_first_rows_response(
rows_max_number: int,
rows_min_number: int,
columns_max_number: int,
dataset_scripts_allow_list: list[str],
max_size_fallback: Optional[int] = None,
) -> SplitFirstRowsResponse:
"""
@@ -107,6 +113,11 @@ def compute_first_rows_response(
The minimum number of rows of the response.
columns_max_number (`int`):
The maximum number of columns supported.
dataset_scripts_allow_list (`list[str]`):
List of datasets for which we support dataset scripts.
Unix shell-style wildcards also work in the dataset name for namespaced datasets,
for example `some_namespace/*` to refer to all the datasets in the `some_namespace` namespace.
The keyword `{{ALL_DATASETS_WITH_NO_NAMESPACE}}` refers to all the datasets without namespace.
Returns:
[`SplitFirstRowsResponse`]: The list of first rows of the split.
Raises the following errors:
@@ -130,6 +141,8 @@
If the split rows could not be obtained using the datasets library in streaming mode.
- [`libcommon.exceptions.NormalRowsError`]
If the split rows could not be obtained using the datasets library in normal mode.
- [`libcommon.exceptions.DatasetWithScriptNotSupportedError`]
If the dataset has a dataset script and is not in the allow list.
"""
logging.info(f"get first-rows for dataset={dataset} config={config} split={split}")
# get the features
@@ -147,13 +160,14 @@
if not info.features:
try:
# https://github.com/huggingface/datasets/blob/f5826eff9b06ab10dba1adfa52543341ef1e6009/src/datasets/iterable_dataset.py#L1255
iterable_dataset = load_dataset(
path=dataset,
name=config,
split=split,
streaming=True,
token=hf_token,
)
with disable_dataset_scripts_support(dataset_scripts_allow_list):
iterable_dataset = load_dataset(
path=dataset,
name=config,
split=split,
streaming=True,
token=hf_token,
)
if not isinstance(iterable_dataset, IterableDataset):
raise TypeError("load_dataset should return an IterableDataset.")
iterable_dataset = iterable_dataset._resolve_features()
@@ -197,15 +211,16 @@
)

# get the rows
rows_content = get_rows_or_raise(
dataset=dataset,
config=config,
split=split,
info=info,
max_size_fallback=max_size_fallback,
rows_max_number=rows_max_number,
token=hf_token,
)
with disable_dataset_scripts_support(dataset_scripts_allow_list):
rows_content = get_rows_or_raise(
dataset=dataset,
config=config,
split=split,
info=info,
max_size_fallback=max_size_fallback,
rows_max_number=rows_max_number,
token=hf_token,
)
rows = rows_content["rows"]
all_fetched = rows_content["all_fetched"]

@@ -301,5 +316,6 @@ def compute(self) -> CompleteJobResult:
rows_max_number=self.first_rows_config.max_number,
rows_min_number=self.first_rows_config.min_number,
columns_max_number=self.first_rows_config.columns_max_number,
dataset_scripts_allow_list=self.app_config.common.dataset_scripts_allow_list,
)
)
(changes to another file; filename not shown in this diff view)
@@ -24,7 +24,7 @@
from worker.config import AppConfig, OptInOutUrlsScanConfig
from worker.dtos import CompleteJobResult, OptInOutUrlsScanResponse, OptUrl
from worker.job_runners.split.split_job_runner import SplitJobRunnerWithDatasetsCache
from worker.utils import get_rows_or_raise
from worker.utils import disable_dataset_scripts_support, get_rows_or_raise


async def check_spawning(
@@ -102,6 +102,7 @@ def compute_opt_in_out_urls_scan_response(
max_concurrent_requests_number: int,
max_requests_per_second: int,
spawning_url: str,
dataset_scripts_allow_list: list[str],
) -> OptInOutUrlsScanResponse:
"""
Get the response of split-opt-in-out-urls-scan cache for a specific split of a dataset from huggingface.co.
@@ -132,6 +133,11 @@ def compute_opt_in_out_urls_scan_response(
The maximum number of requests to be processed by second.
spawning_url (`str`):
Spawning API URL
dataset_scripts_allow_list (`list[str]`):
List of datasets for which we support dataset scripts.
Unix shell-style wildcards also work in the dataset name for namespaced datasets,
for example `some_namespace/*` to refer to all the datasets in the `some_namespace` namespace.
The keyword `{{ALL_DATASETS_WITH_NO_NAMESPACE}}` refers to all the datasets without namespace.

Returns:
[`OptInOutUrlsScanResponse`]
Expand All @@ -148,6 +154,8 @@ def compute_opt_in_out_urls_scan_response(
If the split rows could not be obtained using the datasets library in streaming mode.
- [`libcommon.exceptions.NormalRowsError`]
If the split rows could not be obtained using the datasets library in normal mode.
- [`libcommon.exceptions.DatasetWithScriptNotSupportedError`]
If the dataset has a dataset script and is not in the allow list.
"""
logging.info(f"get opt-in-out-urls-scan for dataset={dataset} config={config} split={split}")

@@ -200,15 +208,16 @@
)

# get the rows
rows_content = get_rows_or_raise(
dataset=dataset,
config=config,
split=split,
info=info,
rows_max_number=rows_max_number,
token=hf_token,
column_names=image_url_columns,
)
with disable_dataset_scripts_support(dataset_scripts_allow_list):
rows_content = get_rows_or_raise(
dataset=dataset,
config=config,
split=split,
info=info,
rows_max_number=rows_max_number,
token=hf_token,
column_names=image_url_columns,
)
rows = rows_content["rows"]

# get the urls
@@ -300,5 +309,6 @@ def compute(self) -> CompleteJobResult:
max_concurrent_requests_number=self.urls_scan_config.max_concurrent_requests_number,
max_requests_per_second=self.urls_scan_config.max_requests_per_second,
spawning_url=self.urls_scan_config.spawning_url,
dataset_scripts_allow_list=self.app_config.common.dataset_scripts_allow_list,
)
)
40 changes: 40 additions & 0 deletions services/worker/src/worker/utils.py
@@ -9,12 +9,17 @@
import traceback
import warnings
from collections.abc import Callable, Sequence
from contextlib import AbstractContextManager
from fnmatch import fnmatch
from typing import Any, Optional, TypeVar, Union, cast
from unittest.mock import patch
from urllib.parse import quote

import datasets
import PIL
import requests
from datasets import Dataset, DatasetInfo, DownloadConfig, IterableDataset, load_dataset
from datasets.load import HubDatasetModuleFactoryWithScript
from datasets.utils.file_utils import get_authentication_headers_for_url
from fsspec.implementations.http import HTTPFileSystem
from huggingface_hub.hf_api import HfApi
@@ -23,6 +28,7 @@
from libcommon.exceptions import (
ConfigNotFoundError,
DatasetNotFoundError,
DatasetWithScriptNotSupportedError,
NormalRowsError,
PreviousStepFormatError,
SplitNotFoundError,
@@ -372,3 +378,37 @@ def is_dataset_script_error() -> bool:
(t, v, tb) = sys.exc_info()
cause_traceback: list[str] = traceback.format_exception(t, v, tb)
return any(EXTERNAL_DATASET_SCRIPT_PATTERN in cause for cause in cause_traceback)


def disable_dataset_scripts_support(allow_list: list[str]) -> AbstractContextManager[Any]:
original_init = HubDatasetModuleFactoryWithScript.__init__

def raise_unsupported_dataset_with_script_or_init(
self: HubDatasetModuleFactoryWithScript,
name: str,
revision: Optional[Union[str, datasets.Version]] = None,
download_config: Optional[DownloadConfig] = None,
download_mode: Optional[Union[datasets.DownloadMode, str]] = None,
dynamic_modules_path: Optional[str] = None,
) -> None:
for allowed_pattern in allow_list:
if (allowed_pattern == "{{ALL_DATASETS_WITH_NO_NAMESPACE}}" and "/" not in name) or fnmatch(
name, allowed_pattern
):
break
else:
raise DatasetWithScriptNotSupportedError(
"The dataset viewer doesn't support this dataset because it runs "
"arbitrary python code. Please open a discussion in the discussion tab "
"if you think this is an error and tag @lhoestq and @severo."
)
original_init(
self=self,
name=name,
revision=revision,
download_config=download_config,
download_mode=download_mode,
dynamic_modules_path=dynamic_modules_path,
)

return patch.object(HubDatasetModuleFactoryWithScript, "__init__", raise_unsupported_dataset_with_script_or_init)
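Intended usage, matching how the job runners above wrap `datasets` calls (a sketch; the dataset name is hypothetical):

```python
from datasets import load_dataset_builder
from libcommon.exceptions import DatasetWithScriptNotSupportedError

from worker.utils import disable_dataset_scripts_support

allow_list = ["{{ALL_DATASETS_WITH_NO_NAMESPACE}}", "some_namespace/*"]

try:
    # Inside the block, HubDatasetModuleFactoryWithScript.__init__ is patched:
    # any script-based dataset whose name matches no allow-list pattern raises.
    with disable_dataset_scripts_support(allow_list=allow_list):
        builder = load_dataset_builder("other_namespace/dataset_with_script")  # hypothetical
except DatasetWithScriptNotSupportedError as err:
    print(f"blocked: {err}")
```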
13 changes: 13 additions & 0 deletions services/worker/tests/conftest.py
@@ -62,6 +62,19 @@ def monkeypatch_session() -> Iterator[MonkeyPatch]:
mp.undo()


@fixture()
def use_hub_prod_endpoint() -> Iterator[MonkeyPatch]:
mp = MonkeyPatch()
mp.setattr(
"huggingface_hub.file_download.HUGGINGFACE_CO_URL_TEMPLATE",
"https://huggingface.co/{repo_id}/resolve/{revision}/{filename}",
)
# ^ see https://github.com/huggingface/datasets/pull/5196#issuecomment-1322191056
mp.setattr("datasets.config.HF_ENDPOINT", "https://huggingface.co")
yield mp
mp.undo()
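
A hypothetical test (not part of this PR) showing the fixture in use: while it is active, the `datasets` library resolves files against the production Hub:

```python
from pytest import MonkeyPatch


def test_prod_hub_endpoint_is_set(use_hub_prod_endpoint: MonkeyPatch) -> None:
    import datasets.config

    assert datasets.config.HF_ENDPOINT == "https://huggingface.co"
```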


# see https://github.com/pytest-dev/pytest/issues/363#issuecomment-406536200
@fixture
def set_env_vars(