Skip to content

Commit

Permalink
Raise specific errors instead of ConfigNamesError when appropriate (#…
Browse files Browse the repository at this point in the history
…3041)

* this case should not exist (and we have no occurrence in the db) - let it raise

* add a specific error code when data files are missing

* give a specific exception for common error

* for the remaining errors, let retry, it should fix some of them

* retry some causes

* follow naming convention + pass a message

* temporary, to recompute the errors

* ConnectionError is a standard error

* same with PermissionError

* fix tests
  • Loading branch information
severo authored Aug 22, 2024
1 parent 0434dc6 commit e7fa2e0
Show file tree
Hide file tree
Showing 5 changed files with 48 additions and 9 deletions.
2 changes: 2 additions & 0 deletions libs/libcommon/src/libcommon/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@
LARGE_MAX_FAILED_RUNS = 30 # for errors that should not be permanent
MAX_FAILED_RUNS_PER_ERROR_CODE = {
# default
"ConfigNamesError": DEFAULT_MAX_FAILED_RUNS, # <- 20240822: to recompute all of them in the next backfill
"RetryableConfigNamesError": DEFAULT_MAX_FAILED_RUNS,
"ConnectionError": DEFAULT_MAX_FAILED_RUNS,
"ExternalServerError": DEFAULT_MAX_FAILED_RUNS,
"JobManagerCrashedError": DEFAULT_MAX_FAILED_RUNS,
Expand Down
26 changes: 26 additions & 0 deletions libs/libcommon/src/libcommon/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ def as_response(self) -> ErrorResponse:
"ConfigNamesError",
"ConfigNotFoundError",
"CreateCommitError",
"DataFilesNotFoundError",
"DatasetGenerationError",
"DatasetGenerationCastError",
"DatasetInBlockListError",
Expand All @@ -99,6 +100,7 @@ def as_response(self) -> ErrorResponse:
"ExternalServerError",
"FeaturesError",
"FeaturesResponseEmptyError",
"FileFormatMismatchBetweenSplitsError",
"FileSystemError",
"HfHubError",
"InfoError",
Expand All @@ -120,6 +122,7 @@ def as_response(self) -> ErrorResponse:
"PreviousStepStatusError",
"PreviousStepStillProcessingError",
"PolarsParquetReadError",
"RetryableConfigNamesError",
"RowsPostProcessingError",
"SplitsNamesError",
"SplitNamesFromStreamingError",
Expand Down Expand Up @@ -185,6 +188,13 @@ def __init__(self, message: str, cause: Optional[BaseException] = None):
super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "CreateCommitError", cause, False)


class DataFilesNotFoundError(CacheableError):
    """Raised when no (supported) data files could be found for the dataset."""

    def __init__(self, message: str, cause: Optional[BaseException] = None):
        # The error code mirrors the class name, following the file's convention.
        code = "DataFilesNotFoundError"
        super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, code, cause, False)


class DatasetGenerationError(CacheableError):
"""The dataset generation failed."""

Expand Down Expand Up @@ -323,6 +333,15 @@ def __init__(self, message: str, cause: Optional[BaseException] = None):
super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "FeaturesResponseEmptyError", cause, True)


class FileFormatMismatchBetweenSplitsError(CacheableError):
    """Raised when the same data file format could not be inferred for all splits."""

    def __init__(self, message: str, cause: Optional[BaseException] = None):
        # The error code mirrors the class name, following the file's convention.
        code = "FileFormatMismatchBetweenSplitsError"
        super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, code, cause, False)


class FileSystemError(CacheableError):
    """An error happened while reading from the file system."""

Expand Down Expand Up @@ -446,6 +465,13 @@ def __init__(self, message: str, cause: Optional[BaseException] = None):
super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "PreviousStepStillProcessingError", cause, False)


class RetryableConfigNamesError(CacheableError):
    """Raised when the config names could not be fetched, but the job should be retried."""

    def __init__(self, message: str, cause: Optional[BaseException] = None):
        # The error code mirrors the class name, following the file's convention.
        # NOTE(review): the final flag differs from the sibling classes above (True
        # instead of False) — confirm its exact semantics against CacheableError.
        code = "RetryableConfigNamesError"
        super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, code, cause, True)


class RowsPostProcessingError(CacheableError):
"""The rows could not be post-processed successfully."""

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@
)
from libcommon.dtos import JobInfo, SplitHubFile
from libcommon.exceptions import (
ConfigNamesError,
CreateCommitError,
DatasetGenerationCastError,
DatasetGenerationError,
Expand Down Expand Up @@ -1408,8 +1407,6 @@ def compute_config_parquet_and_info_response(
If one of the commits could not be created on the Hub.
[~`libcommon.exceptions.EmptyDatasetError`]:
The dataset is empty.
[~`libcommon.exceptions.ConfigNamesError`]:
If the list of configurations could not be obtained using the datasets library.
[~`libcommon.exceptions.UnsupportedExternalFilesError`]:
If we failed to get the external files sizes to make sure we can convert the dataset to parquet
[~`libcommon.exceptions.ExternalFilesSizeRequestHTTPError`]:
Expand Down Expand Up @@ -1450,7 +1447,7 @@ def compute_config_parquet_and_info_response(

config_names = {config_name_item["config"] for config_name_item in config_names_content["config_names"]}
if config not in config_names:
raise ConfigNamesError(f"{config=} does not exist in {dataset=}")
raise ValueError(f"{config=} does not exist in {dataset=}")

hf_api = HfApi(endpoint=hf_endpoint, token=hf_token)
committer_hf_api = HfApi(endpoint=hf_endpoint, token=committer_hf_token)
Expand Down
15 changes: 14 additions & 1 deletion services/worker/src/worker/job_runners/dataset/config_names.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,20 @@
load_dataset_builder,
)
from datasets.data_files import EmptyDatasetError as _EmptyDatasetError
from datasets.exceptions import DefunctDatasetError
from datasets.exceptions import (
DataFilesNotFoundError as _DataFilesNotFoundError,
)
from datasets.exceptions import DatasetNotFoundError, DefunctDatasetError
from huggingface_hub.utils import HfHubHTTPError
from libcommon.exceptions import (
ConfigNamesError,
DataFilesNotFoundError,
DatasetModuleNotInstalledError,
DatasetWithScriptNotSupportedError,
DatasetWithTooManyConfigsError,
EmptyDatasetError,
FileFormatMismatchBetweenSplitsError,
RetryableConfigNamesError,
)

from worker.dtos import CompleteJobResult, ConfigNameItem, DatasetConfigNamesResponse
Expand Down Expand Up @@ -107,10 +114,16 @@ def compute_config_names_response(
]
except _EmptyDatasetError as err:
raise EmptyDatasetError("The dataset is empty.", cause=err) from err
except _DataFilesNotFoundError as err:
raise DataFilesNotFoundError(str(err), cause=err) from err
except ValueError as err:
if "trust_remote_code" in str(err):
raise DatasetWithScriptNotSupportedError from err
if "Couldn't infer the same data file format for all splits" in str(err):
raise FileFormatMismatchBetweenSplitsError(str(err), cause=err) from err
raise ConfigNamesError("Cannot get the config names for the dataset.", cause=err) from err
except (HfHubHTTPError, BrokenPipeError, DatasetNotFoundError, PermissionError, ConnectionError) as err:
raise RetryableConfigNamesError("Cannot get the config names for the dataset.", cause=err) from err
except ImportError as err:
# this should only happen if the dataset is in the allow list, which should soon disappear
raise DatasetModuleNotInstalledError(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from collections.abc import Callable
from dataclasses import replace
from typing import Optional
from unittest.mock import patch

import pytest
Expand Down Expand Up @@ -98,9 +99,9 @@ def test_compute_too_many_configs(
("n_configs_with_default", False, None, None),
# should we really test the following cases?
# The assumption is that the dataset exists and is accessible with the token
("does_not_exist", False, "ConfigNamesError", "DatasetNotFoundError"),
("gated", False, "ConfigNamesError", "ConnectionError"), # See: huggingface/datasets#7109
("private", False, "ConfigNamesError", "DatasetNotFoundError"),
("does_not_exist", False, "RetryableConfigNamesError", "DatasetNotFoundError"),
("gated", False, "RetryableConfigNamesError", "ConnectionError"), # See: huggingface/datasets#7109
("private", False, "RetryableConfigNamesError", "DatasetNotFoundError"),
],
)
def test_compute_splits_response(
Expand All @@ -114,7 +115,7 @@ def test_compute_splits_response(
get_job_runner: GetJobRunner,
name: str,
use_token: bool,
error_code: str,
error_code: Optional[str],
cause: str,
app_config: AppConfig,
) -> None:
Expand Down

0 comments on commit e7fa2e0

Please sign in to comment.