remove support for script-based datasets #3046

Merged: 4 commits, Aug 26, 2024
2 changes: 0 additions & 2 deletions chart/templates/_env/_envCommon.tpl
@@ -4,8 +4,6 @@
 {{- define "envCommon" -}}
 - name: COMMON_BLOCKED_DATASETS
   value: {{ .Values.common.blockedDatasets | quote}}
-- name: COMMON_DATASET_SCRIPTS_ALLOW_LIST
-  value: {{ .Values.common.datasetScriptsAllowList | quote}}
 - name: COMMON_HF_ENDPOINT
   value: {{ include "datasetsServer.hub.url" . }}
 - name: HF_ENDPOINT # see https://github.com/huggingface/datasets/pull/5196#issuecomment-1322191411
2 changes: 0 additions & 2 deletions chart/templates/_env/_envWorker.tpl
@@ -47,8 +47,6 @@
 {{- end }}
 - name: PARQUET_AND_INFO_MAX_DATASET_SIZE_BYTES
   value: {{ .Values.parquetAndInfo.maxDatasetSizeBytes | quote }}
-- name: PARQUET_AND_INFO_MAX_EXTERNAL_DATA_FILES
-  value: {{ .Values.parquetAndInfo.maxExternalDataFiles | quote }}
 - name: PARQUET_AND_INFO_MAX_ROW_GROUP_BYTE_SIZE_FOR_COPY
   value: {{ .Values.parquetAndInfo.maxRowGroupByteSizeForCopy | quote }}
 - name: PARQUET_AND_INFO_SOURCE_REVISION
3 changes: 0 additions & 3 deletions chart/values.yaml
@@ -80,9 +80,6 @@ images:
 common:
   # Comma-separated list of blocked datasets. No jobs will be processed for those datasets.
   blockedDatasets: "open-llm-leaderboard-old/details_*,lunaluan/*,atom-in-the-universe/*,cot-leaderboard/cot-eval-traces,mitermix/yt-links,mcding-org/*"
-  # Comma-separated list of the datasets for which we support dataset scripts.
-  # Unix shell-style wildcards also work in the dataset name for namespaced datasets, for example `some_namespace/*` to refer to all the datasets in the `some_namespace` namespace.
-  datasetScriptsAllowList: "hf-internal-testing/dataset_with_script,togethercomputer/RedPajama-Data-1T,togethercomputer/RedPajama-Data-V2,gaia-benchmark/GAIA,poloclub/diffusiondb,mozilla-foundation/common_voice_*,google/fleurs,speechcolab/gigaspeech,espnet/yodas"
   # URL of the HuggingFace Hub
   hfEndpoint: ""
 
10 changes: 1 addition & 9 deletions docs/source/openapi.json
@@ -128,10 +128,7 @@
 {
   "$ref": "#/components/schemas/X-Error-Code-ResponseNotReadyError"
 },
-{ "$ref": "#/components/schemas/X-Error-Code-UnexpectedError" },
-{
-  "$ref": "#/components/schemas/X-Error-Code-ExternalFilesSizeRequestHTTPError"
-}
+{ "$ref": "#/components/schemas/X-Error-Code-UnexpectedError" }
 ]
 },
 "required": false
@@ -1311,11 +1308,6 @@
   "const": "ExternalAuthenticatedError",
   "description": "Raised when the external authentication check failed while the user was authenticated. Even if the external authentication server returns 403 in that case, we return 404 because we don't know if the dataset exist or not. It's also coherent with how the Hugging Face Hub works."
 },
-"X-Error-Code-ExternalFilesSizeRequestHTTPError": {
-  "type": "string",
-  "const": "ExternalFilesSizeRequestHTTPError",
-  "description": "We failed to get the size of the external files."
-},
 "X-Error-Code-ExternalUnauthenticatedError": {
   "type": "string",
   "const": "ExternalUnauthenticatedError",
1 change: 0 additions & 1 deletion libs/libcommon/README.md
@@ -33,7 +33,6 @@ To enable signed URLs on the StorageClient, `CLOUDFRONT_KEY_PAIR_ID` and `CLOUDF
 Set the common environment variables to configure the following aspects:
 
 - `COMMON_BLOCKED_DATASETS`: comma-separated list of the blocked datasets. Unix shell-style wildcards also work in the dataset name for namespaced datasets, for example `some_namespace/*` to block all the datasets in the `some_namespace` namespace. If empty, no dataset is blocked. Defaults to empty.
-- `COMMON_DATASET_SCRIPTS_ALLOW_LIST`: comma-separated list of the datasets for which we support dataset scripts. Unix shell-style wildcards also work in the dataset name for namespaced datasets, for example `some_namespace/*` to refer to all the datasets in the `some_namespace` namespace. If empty, no dataset with script is supported. Defaults to empty.
 - `COMMON_HF_ENDPOINT`: URL of the HuggingFace Hub. Defaults to `https://huggingface.co`.
 - `COMMON_HF_TOKEN`: App Access Token (ask moonlanding administrators to get one, only the `read` role is required) to access the gated datasets. Defaults to empty.
 
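For context, the blocking behavior that remains after this change is plain Unix shell-style wildcard matching on dataset names. A minimal sketch of that matching (the `is_blocked` helper is illustrative, not the exact libcommon API):

```python
# Sketch of Unix shell-style wildcard matching for COMMON_BLOCKED_DATASETS.
# `is_blocked` is an illustrative helper, not the exact libcommon function.
from fnmatch import fnmatch


def is_blocked(dataset: str, blocked_patterns: list[str]) -> bool:
    """Return True if the dataset name matches any pattern, e.g. `some_namespace/*`."""
    return any(fnmatch(dataset, pattern) for pattern in blocked_patterns)


print(is_blocked("lunaluan/my-dataset", ["lunaluan/*", "mitermix/yt-links"]))  # True
print(is_blocked("squad", ["some_namespace/*"]))                               # False
```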
5 changes: 0 additions & 5 deletions libs/libcommon/src/libcommon/config.py
@@ -145,15 +145,13 @@ def from_env(cls) -> "RowsIndexConfig":
 
 
 COMMON_BLOCKED_DATASETS: list[str] = []
-COMMON_DATASET_SCRIPTS_ALLOW_LIST: list[str] = []
 COMMON_HF_ENDPOINT = "https://huggingface.co"
 COMMON_HF_TOKEN = None
 
 
 @dataclass(frozen=True)
 class CommonConfig:
     blocked_datasets: list[str] = field(default_factory=COMMON_BLOCKED_DATASETS.copy)
-    dataset_scripts_allow_list: list[str] = field(default_factory=COMMON_DATASET_SCRIPTS_ALLOW_LIST.copy)
     hf_endpoint: str = COMMON_HF_ENDPOINT
     hf_token: Optional[str] = COMMON_HF_TOKEN
 
@@ -163,9 +161,6 @@ def from_env(cls) -> "CommonConfig":
         with env.prefixed("COMMON_"):
             return cls(
                 blocked_datasets=env.list(name="BLOCKED_DATASETS", default=COMMON_BLOCKED_DATASETS.copy()),
-                dataset_scripts_allow_list=env.list(
-                    name="DATASET_SCRIPTS_ALLOW_LIST", default=COMMON_DATASET_SCRIPTS_ALLOW_LIST.copy()
-                ),
                 hf_endpoint=env.str(name="HF_ENDPOINT", default=COMMON_HF_ENDPOINT),
                 hf_token=env.str(name="HF_TOKEN", default=COMMON_HF_TOKEN), # nosec
             )
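After this change, `CommonConfig` only reads the three remaining `COMMON_*` variables. A small usage sketch, assuming libcommon is installed; the environment values below are examples:

```python
# Illustrative usage of the trimmed CommonConfig: dataset_scripts_allow_list is gone,
# only blocked_datasets, hf_endpoint and hf_token remain.
import os

from libcommon.config import CommonConfig

os.environ["COMMON_BLOCKED_DATASETS"] = "some_namespace/*,another_org/private-dataset"

config = CommonConfig.from_env()
print(config.blocked_datasets)  # ['some_namespace/*', 'another_org/private-dataset']
print(config.hf_endpoint)       # 'https://huggingface.co' (the default)
```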
10 changes: 8 additions & 2 deletions libs/libcommon/src/libcommon/constants.py
@@ -42,6 +42,14 @@
 DEFAULT_MAX_FAILED_RUNS = 3
 LARGE_MAX_FAILED_RUNS = 30 # for errors that should not be permanent
 MAX_FAILED_RUNS_PER_ERROR_CODE = {
+    # 20240823 - temporary until all these deprecated errors disappear
+    "UnsupportedExternalFilesError": DEFAULT_MAX_FAILED_RUNS,
+    "ExternalFilesSizeRequestError": DEFAULT_MAX_FAILED_RUNS,
+    "ExternalFilesSizeRequestHTTPError": DEFAULT_MAX_FAILED_RUNS,
+    "ExternalFilesSizeRequestConnectionError": DEFAULT_MAX_FAILED_RUNS,
+    "ExternalFilesSizeRequestTimeoutError": DEFAULT_MAX_FAILED_RUNS,
+    "DatasetModuleNotInstalledError": DEFAULT_MAX_FAILED_RUNS,
+    "DatasetScriptError": DEFAULT_MAX_FAILED_RUNS,
     # default
     "RetryableConfigNamesError": DEFAULT_MAX_FAILED_RUNS,
     "ConnectionError": DEFAULT_MAX_FAILED_RUNS,
@@ -56,8 +64,6 @@
 }
 ERROR_CODES_TO_RETRY = list(MAX_FAILED_RUNS_PER_ERROR_CODE.keys())
 
-EXTERNAL_DATASET_SCRIPT_PATTERN = "datasets_modules/datasets"
-
 # Arrays are not immutable, we have to take care of not modifying them
 # Anyway: in all this file, we allow constant reassignment (no use of Final)
 CONFIG_HAS_VIEWER_KIND = "config-size"
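The deprecated error codes are kept in the retry map so that existing cache entries carrying those errors keep getting retried until they disappear. A minimal sketch of how such a budget map can drive a retry decision (`should_retry` is an illustrative helper, not the actual libcommon scheduler code):

```python
# Sketch of consulting a retry budget map like MAX_FAILED_RUNS_PER_ERROR_CODE.
DEFAULT_MAX_FAILED_RUNS = 3

MAX_FAILED_RUNS_PER_ERROR_CODE = {
    # deprecated codes kept temporarily so existing cache entries get retried away
    "UnsupportedExternalFilesError": DEFAULT_MAX_FAILED_RUNS,
    "DatasetScriptError": DEFAULT_MAX_FAILED_RUNS,
    # regular retryable codes
    "RetryableConfigNamesError": DEFAULT_MAX_FAILED_RUNS,
}


def should_retry(error_code: str, failed_runs: int) -> bool:
    """Retry only while the error code is retryable and under its failure budget."""
    max_runs = MAX_FAILED_RUNS_PER_ERROR_CODE.get(error_code)
    return max_runs is not None and failed_runs < max_runs


print(should_retry("DatasetScriptError", 1))    # True: still under the budget of 3
print(should_retry("DatasetScriptError", 3))    # False: budget exhausted
print(should_retry("DatasetNotFoundError", 0))  # False: not a retryable code
```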
56 changes: 0 additions & 56 deletions libs/libcommon/src/libcommon/exceptions.py
@@ -82,9 +82,7 @@ def as_response(self) -> ErrorResponse:
     "DatasetGenerationError",
     "DatasetGenerationCastError",
     "DatasetInBlockListError",
-    "DatasetModuleNotInstalledError",
     "DatasetNotFoundError",
-    "DatasetScriptError",
     "DatasetWithScriptNotSupportedError",
     "DatasetWithTooComplexDataFilesPatternsError",
     "DatasetWithTooManyConfigsError",
@@ -93,10 +91,6 @@ def as_response(self) -> ErrorResponse:
     "DiskError",
     "DuckDBIndexFileNotFoundError",
     "EmptyDatasetError",
-    "ExternalFilesSizeRequestConnectionError",
-    "ExternalFilesSizeRequestError",
-    "ExternalFilesSizeRequestHTTPError",
-    "ExternalFilesSizeRequestTimeoutError",
     "ExternalServerError",
     "FeaturesError",
     "FeaturesResponseEmptyError",
@@ -134,7 +128,6 @@ def as_response(self) -> ErrorResponse:
     "TooLongColumnNameError",
     "TooManyColumnsError",
     "UnexpectedError",
-    "UnsupportedExternalFilesError",
 ]
 
 
@@ -209,13 +202,6 @@ def __init__(self, message: str, cause: Optional[BaseException] = None):
         super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "DatasetGenerationCastError", cause, True)
 
 
-class DatasetModuleNotInstalledError(CacheableError):
-    """The dataset tries to import a module that is not installed."""
-
-    def __init__(self, message: str, cause: Optional[BaseException] = None):
-        super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "DatasetModuleNotInstalledError", cause, True)
-
-
 class DatasetNotFoundError(CacheableError):
     """The dataset does not exist."""
 
@@ -229,13 +215,6 @@ def __init__(self, message: str, cause: Optional[BaseException] = None):
         )
 
 
-class DatasetScriptError(CacheableError):
-    """The dataset script generated an error."""
-
-    def __init__(self, message: str, cause: Optional[BaseException] = None):
-        super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "DatasetScriptError", cause, False)
-
-
 class DatasetWithTooManyConfigsError(CacheableError):
     """The number of configs of a dataset exceeded the limit."""
 
@@ -284,34 +263,6 @@ def __init__(self, message: str, cause: Optional[BaseException] = None):
         super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "EmptyDatasetError", cause, True)
 
 
-class ExternalFilesSizeRequestConnectionError(CacheableError):
-    """We failed to get the size of the external files."""
-
-    def __init__(self, message: str, cause: Optional[BaseException] = None):
-        super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "ExternalFilesSizeRequestConnectionError", cause, False)
-
-
-class ExternalFilesSizeRequestError(CacheableError):
-    """We failed to get the size of the external files."""
-
-    def __init__(self, message: str, cause: Optional[BaseException] = None):
-        super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "ExternalFilesSizeRequestError", cause, False)
-
-
-class ExternalFilesSizeRequestHTTPError(CacheableError):
-    """We failed to get the size of the external files."""
-
-    def __init__(self, message: str, cause: Optional[BaseException] = None):
-        super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "ExternalFilesSizeRequestHTTPError", cause, False)
-
-
-class ExternalFilesSizeRequestTimeoutError(CacheableError):
-    """We failed to get the size of the external files."""
-
-    def __init__(self, message: str, cause: Optional[BaseException] = None):
-        super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "ExternalFilesSizeRequestTimeoutError", cause, False)
-
-
 class ExternalServerError(CacheableError):
     """The spawning.ai server is not responding."""
 
@@ -574,13 +525,6 @@ def __init__(self, message: str, cause: Optional[BaseException] = None):
         logging.error(message, exc_info=cause)
 
 
-class UnsupportedExternalFilesError(CacheableError):
-    """We failed to get the size of the external files."""
-
-    def __init__(self, message: str, cause: Optional[BaseException] = None):
-        super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "UnsupportedExternalFilesError", cause, False)
-
-
 class DatasetWithScriptNotSupportedError(CacheableError):
     """We don't support some datasets because they have a dataset script."""
 
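With the script-specific error classes removed, datasets that still ship a loading script are reported through the remaining `DatasetWithScriptNotSupportedError`. A hypothetical check illustrating that behavior, assuming libcommon is importable (`ensure_no_script` is an assumption, not the worker's actual code path):

```python
# Hypothetical guard: reject repositories that contain a loading script.
from libcommon.exceptions import DatasetWithScriptNotSupportedError


def ensure_no_script(repo_filenames: list[str]) -> None:
    if any(name.endswith(".py") for name in repo_filenames):
        raise DatasetWithScriptNotSupportedError("Datasets with a loading script are no longer supported.")


ensure_no_script(["data/train-00000-of-00001.parquet", "README.md"])  # passes silently
```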
1 change: 0 additions & 1 deletion services/worker/README.md
@@ -76,7 +76,6 @@ Set environment variables to configure the `parquet-and-info` worker (`PARQUET_A
 - `PARQUET_AND_INFO_COMMIT_MESSAGE`: the git commit message when the worker uploads the parquet files to the Hub. Defaults to `Update parquet files`.
 - `PARQUET_AND_INFO_COMMITTER_HF_TOKEN`: the HuggingFace token to commit the parquet files to the Hub. The token must be an app token associated with a user that has the right to 1. create the `refs/convert/parquet` branch (see `PARQUET_AND_INFO_TARGET_REVISION`) and 2. push commits to it on any dataset. [Datasets maintainers](https://huggingface.co/datasets-maintainers) members have these rights. The token must have permission to write. If not set, the worker will fail. Defaults to None.
 - `PARQUET_AND_INFO_MAX_DATASET_SIZE_BYTES`: the maximum size in bytes of the dataset to pre-compute the parquet files. Bigger datasets, or datasets without that information, are partially streamed to get parquet files up to this value. Defaults to `100_000_000`.
-- `PARQUET_AND_INFO_MAX_EXTERNAL_DATA_FILES`: the maximum number of external files of the datasets. Bigger datasets, or datasets without that information, are partially streamed to get parquet files up to `PARQUET_AND_INFO_MAX_DATASET_SIZE_BYTES` bytes. Defaults to `10_000`.
 - `PARQUET_AND_INFO_MAX_ROW_GROUP_BYTE_SIZE_FOR_COPY`: the maximum size in bytes of the row groups of parquet datasets that are copied to the target revision. Bigger datasets, or datasets without that information, are partially streamed to get parquet files up to `PARQUET_AND_INFO_MAX_DATASET_SIZE_BYTES` bytes. Defaults to `100_000_000`.
 - `PARQUET_AND_INFO_SOURCE_REVISION`: the git revision of the dataset to use to prepare the parquet files. Defaults to `main`.
 - `PARQUET_AND_INFO_TARGET_REVISION`: the git revision of the dataset where to store the parquet files. Make sure the committer token (`PARQUET_AND_INFO_COMMITTER_HF_TOKEN`) has the permission to write there. Defaults to `refs/convert/parquet`.
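The size gate described in the README keeps working the same way after the removal: datasets larger than `PARQUET_AND_INFO_MAX_DATASET_SIZE_BYTES`, or with an unknown size, are only partially streamed to parquet. A rough sketch of that decision (`conversion_mode` is an assumed helper, not the worker's exact implementation):

```python
# Illustrative size gate for parquet conversion.
from typing import Optional

MAX_DATASET_SIZE_BYTES = 100_000_000  # the documented default


def conversion_mode(dataset_size_bytes: Optional[int]) -> str:
    """Full conversion when the size is known and under the limit, partial streaming otherwise."""
    if dataset_size_bytes is not None and dataset_size_bytes <= MAX_DATASET_SIZE_BYTES:
        return "full conversion"
    return "partial conversion (stream up to MAX_DATASET_SIZE_BYTES bytes)"


print(conversion_mode(5_000_000))          # full conversion
print(conversion_mode(None))               # partial conversion (size unknown)
print(conversion_mode(1_000_000_000_000))  # partial conversion (too big)
```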
5 changes: 0 additions & 5 deletions services/worker/src/worker/config.py
@@ -209,7 +209,6 @@ def from_env(cls) -> "PresidioEntitiesScanConfig":
 PARQUET_AND_INFO_COMMIT_MESSAGE = "Update parquet files"
 PARQUET_AND_INFO_COMMITTER_HF_TOKEN = None
 PARQUET_AND_INFO_MAX_DATASET_SIZE_BYTES = 100_000_000
-PARQUET_AND_INFO_MAX_EXTERNAL_DATA_FILES = 10_000
 PARQUET_AND_INFO_MAX_ROW_GROUP_BYTE_SIZE_FOR_COPY = 100_000_000
 PARQUET_AND_INFO_SOURCE_REVISION = "main"
 PARQUET_AND_INFO_TARGET_REVISION = "refs/convert/parquet"
@@ -221,7 +220,6 @@ class ParquetAndInfoConfig:
     commit_message: str = PARQUET_AND_INFO_COMMIT_MESSAGE
     committer_hf_token: Optional[str] = PARQUET_AND_INFO_COMMITTER_HF_TOKEN
     max_dataset_size_bytes: int = PARQUET_AND_INFO_MAX_DATASET_SIZE_BYTES
-    max_external_data_files: int = PARQUET_AND_INFO_MAX_EXTERNAL_DATA_FILES
     max_row_group_byte_size_for_copy: int = PARQUET_AND_INFO_MAX_ROW_GROUP_BYTE_SIZE_FOR_COPY
     source_revision: str = PARQUET_AND_INFO_SOURCE_REVISION
     target_revision: str = PARQUET_AND_INFO_TARGET_REVISION
@@ -237,9 +235,6 @@ def from_env(cls) -> "ParquetAndInfoConfig":
                 max_dataset_size_bytes=env.int(
                     name="MAX_DATASET_SIZE_BYTES", default=PARQUET_AND_INFO_MAX_DATASET_SIZE_BYTES
                 ),
-                max_external_data_files=env.int(
-                    name="MAX_EXTERNAL_DATA_FILES", default=PARQUET_AND_INFO_MAX_EXTERNAL_DATA_FILES
-                ),
                 max_row_group_byte_size_for_copy=env.int(
                     name="MAX_ROW_GROUP_BYTE_SIZE_FOR_COPY", default=PARQUET_AND_INFO_MAX_ROW_GROUP_BYTE_SIZE_FOR_COPY
                 ),
10 changes: 1 addition & 9 deletions services/worker/src/worker/job_manager.py
@@ -10,7 +10,6 @@
 from libcommon.exceptions import (
     CustomError,
     DatasetNotFoundError,
-    DatasetScriptError,
     JobManagerCrashedError,
     JobManagerExceededMaximumDurationError,
     PreviousStepStillProcessingError,
@@ -27,7 +26,6 @@
 
 from worker.config import AppConfig, WorkerConfig
 from worker.job_runner import JobRunner
-from worker.utils import is_dataset_script_error
 
 
 class JobManager:
@@ -187,13 +185,7 @@ def process(
                 "duration": get_duration_or_none(started_at),
             }
         except Exception as err:
-            e = (
-                err
-                if isinstance(err, CustomError)
-                else DatasetScriptError(str(err), err)
-                if is_dataset_script_error()
-                else UnexpectedError(str(err), err)
-            )
+            e = err if isinstance(err, CustomError) else UnexpectedError(str(err), err)
             self.debug(f"response for job_info={self.job_info} had an error")
             return {
                 "job_info": self.job_info,