From 185e6b3579e1fcad019a5df7895120620cdefdcd Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Wed, 30 Oct 2024 17:16:06 +0100 Subject: [PATCH 01/12] remove datasets required dependency --- setup.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/setup.py b/setup.py index 82892bfcc8c..accf6adf148 100644 --- a/setup.py +++ b/setup.py @@ -13,14 +13,11 @@ REQUIRED_PKGS = [ - "coloredlogs", - "sympy", "transformers>=4.29", "torch>=1.11", "packaging", "numpy", "huggingface_hub>=0.8.0", - "datasets", ] # TODO: unpin pytest once https://github.com/huggingface/transformers/pull/29154 is merged & released From be44770ab65b5b9cf3fd7f7a1f1a9bb494c2185d Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Thu, 31 Oct 2024 17:09:43 +0100 Subject: [PATCH 02/12] install datasets when needed --- .github/workflows/dev_test_benckmark.yml | 2 +- .github/workflows/test_benckmark.yml | 2 +- .github/workflows/test_utils.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/dev_test_benckmark.yml b/.github/workflows/dev_test_benckmark.yml index 5f6fc825021..add6f6fd2da 100644 --- a/.github/workflows/dev_test_benckmark.yml +++ b/.github/workflows/dev_test_benckmark.yml @@ -27,7 +27,7 @@ jobs: - name: Install dependencies run: | pip install wheel - pip install .[tests,onnxruntime,benchmark] + pip install .[tests,onnxruntime,benchmark] datasets pip install -U git+https://github.com/huggingface/evaluate pip install -U git+https://github.com/huggingface/diffusers pip install -U git+https://github.com/huggingface/transformers diff --git a/.github/workflows/test_benckmark.yml b/.github/workflows/test_benckmark.yml index e859e845d64..fe7df1a20cc 100644 --- a/.github/workflows/test_benckmark.yml +++ b/.github/workflows/test_benckmark.yml @@ -30,7 +30,7 @@ jobs: - name: Install dependencies run: | pip install wheel - pip install .[tests,onnxruntime,benchmark] + pip install .[tests,onnxruntime,benchmark] datasets - name: Test with unittest run: | python -m unittest discover --start-directory tests/benchmark --pattern 'test_*.py' diff --git a/.github/workflows/test_utils.yml b/.github/workflows/test_utils.yml index b5f2e27fc6a..4e4d2428967 100644 --- a/.github/workflows/test_utils.yml +++ b/.github/workflows/test_utils.yml @@ -32,7 +32,7 @@ jobs: run: | python -m pip install --upgrade pip pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu - pip install .[tests] + pip install .[tests] datasets - name: Test with pytest working-directory: tests From bac8608acba3468e666b1d40a906f017a96b9778 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Thu, 31 Oct 2024 17:09:50 +0100 Subject: [PATCH 03/12] fix --- optimum/onnxruntime/model.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/optimum/onnxruntime/model.py b/optimum/onnxruntime/model.py index caa662f3824..4182abc925f 100644 --- a/optimum/onnxruntime/model.py +++ b/optimum/onnxruntime/model.py @@ -14,10 +14,9 @@ import logging import os -from typing import Callable, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union import numpy as np -from datasets import Dataset from transformers import EvalPrediction from transformers.trainer_pt_utils import nested_concat from transformers.trainer_utils import EvalLoopOutput @@ -25,6 +24,10 @@ from onnxruntime import InferenceSession +if TYPE_CHECKING: + from datasets import Dataset + + logger = logging.getLogger(__name__) @@ -59,7 +62,7 @@ def __init__( self.session = 
InferenceSession(str(model_path), providers=[execution_provider]) self.onnx_input_names = {input_key.name: idx for idx, input_key in enumerate(self.session.get_inputs())} - def evaluation_loop(self, dataset: Dataset): + def evaluation_loop(self, dataset: "Dataset"): """ Run evaluation and returns metrics and predictions. From 64e5b9f3b17c1fc695bd1547b2de59b7c32dc3ba Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Thu, 31 Oct 2024 17:41:05 +0100 Subject: [PATCH 04/12] fixes --- optimum/onnxruntime/configuration.py | 15 ++++++++++----- optimum/onnxruntime/quantization.py | 21 +++++++++++++++------ optimum/onnxruntime/runs/calibrator.py | 10 ++++++---- optimum/runs_base.py | 8 +++++--- optimum/utils/import_utils.py | 5 +++++ 5 files changed, 41 insertions(+), 18 deletions(-) diff --git a/optimum/onnxruntime/configuration.py b/optimum/onnxruntime/configuration.py index 2e3d9f32d6a..adc1984795a 100644 --- a/optimum/onnxruntime/configuration.py +++ b/optimum/onnxruntime/configuration.py @@ -18,9 +18,8 @@ from dataclasses import asdict, dataclass, field from enum import Enum from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union -from datasets import Dataset from packaging.version import Version, parse from onnxruntime import __version__ as ort_version @@ -33,6 +32,10 @@ from ..utils import logging +if TYPE_CHECKING: + from datasets import Dataset + + logger = logging.get_logger(__name__) # This value is used to indicate ORT which axis it should use to quantize an operator "per-channel" @@ -117,7 +120,9 @@ def create_calibrator( class AutoCalibrationConfig: @staticmethod - def minmax(dataset: Dataset, moving_average: bool = False, averaging_constant: float = 0.01) -> CalibrationConfig: + def minmax( + dataset: "Dataset", moving_average: bool = False, averaging_constant: float = 0.01 + ) -> CalibrationConfig: """ Args: dataset (`Dataset`): @@ -151,7 +156,7 @@ def minmax(dataset: Dataset, moving_average: bool = False, averaging_constant: f @staticmethod def entropy( - dataset: Dataset, + dataset: "Dataset", num_bins: int = 128, num_quantized_bins: int = 128, ) -> CalibrationConfig: @@ -188,7 +193,7 @@ def entropy( ) @staticmethod - def percentiles(dataset: Dataset, num_bins: int = 2048, percentile: float = 99.999) -> CalibrationConfig: + def percentiles(dataset: "Dataset", num_bins: int = 2048, percentile: float = 99.999) -> CalibrationConfig: """ Args: dataset (`Dataset`): diff --git a/optimum/onnxruntime/quantization.py b/optimum/onnxruntime/quantization.py index 056123f8d8e..5c53154b62e 100644 --- a/optimum/onnxruntime/quantization.py +++ b/optimum/onnxruntime/quantization.py @@ -21,7 +21,6 @@ from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union import onnx -from datasets import Dataset, load_dataset from packaging.version import Version, parse from transformers import AutoConfig @@ -29,6 +28,7 @@ from onnxruntime.quantization import CalibrationDataReader, QuantFormat, QuantizationMode, QuantType from onnxruntime.quantization.onnx_quantizer import ONNXQuantizer from onnxruntime.quantization.qdq_quantizer import QDQQuantizer +from optimum.utils.import_utils import is_datasets_available from ..quantization_base import OptimumQuantizer from ..utils.save_utils import maybe_save_preprocessors @@ -40,6 +40,7 @@ if TYPE_CHECKING: + from datasets import Dataset from transformers import PretrainedConfig LOGGER = logging.getLogger(__name__) @@ -48,7 +49,7 @@ class 
ORTCalibrationDataReader(CalibrationDataReader): __slots__ = ["batch_size", "dataset", "_dataset_iter"] - def __init__(self, dataset: Dataset, batch_size: int = 1): + def __init__(self, dataset: "Dataset", batch_size: int = 1): if dataset is None: raise ValueError("Provided dataset is None.") @@ -157,7 +158,7 @@ def from_pretrained( def fit( self, - dataset: Dataset, + dataset: "Dataset", calibration_config: CalibrationConfig, onnx_augmented_model_name: Union[str, Path] = "augmented_model.onnx", operators_to_quantize: Optional[List[str]] = None, @@ -211,7 +212,7 @@ def fit( def partial_fit( self, - dataset: Dataset, + dataset: "Dataset", calibration_config: CalibrationConfig, onnx_augmented_model_name: Union[str, Path] = "augmented_model.onnx", operators_to_quantize: Optional[List[str]] = None, @@ -427,7 +428,7 @@ def get_calibration_dataset( seed: int = 2016, use_auth_token: Optional[Union[bool, str]] = None, token: Optional[Union[bool, str]] = None, - ) -> Dataset: + ) -> "Dataset": """ Creates the calibration `datasets.Dataset` to use for the post-training static quantization calibration step. @@ -473,6 +474,14 @@ def get_calibration_dataset( "provided." ) + if not is_datasets_available(): + raise ImportError( + "`get_calibration_dataset` requires the datasets library but it was not found in your environment. " + "You can install it with pip: `pip install datasets`." + ) + + from datasets import load_dataset + calib_dataset = load_dataset( dataset_name, name=dataset_config_name, @@ -491,7 +500,7 @@ def get_calibration_dataset( return self.clean_calibration_dataset(processed_calib_dataset) - def clean_calibration_dataset(self, dataset: Dataset) -> Dataset: + def clean_calibration_dataset(self, dataset: "Dataset") -> "Dataset": model = onnx.load(self.onnx_model_path) model_inputs = {input.name for input in model.graph.input} ignored_columns = list(set(dataset.column_names) - model_inputs) diff --git a/optimum/onnxruntime/runs/calibrator.py b/optimum/onnxruntime/runs/calibrator.py index c493a943747..bfdcd64d92e 100644 --- a/optimum/onnxruntime/runs/calibrator.py +++ b/optimum/onnxruntime/runs/calibrator.py @@ -1,6 +1,4 @@ -from typing import Dict, List - -from datasets import Dataset +from typing import TYPE_CHECKING, Dict, List from ...runs_base import Calibrator from .. import ORTQuantizer @@ -9,10 +7,14 @@ from ..preprocessors.passes import ExcludeGeLUNodes, ExcludeLayerNormNodes, ExcludeNodeAfter, ExcludeNodeFollowedBy +if TYPE_CHECKING: + from datasets import Dataset + + class OnnxRuntimeCalibrator(Calibrator): def __init__( self, - calibration_dataset: Dataset, + calibration_dataset: "Dataset", quantizer: ORTQuantizer, model_path: str, qconfig: QuantizationConfig, diff --git a/optimum/runs_base.py b/optimum/runs_base.py index 3a1d164c602..dadd445818f 100644 --- a/optimum/runs_base.py +++ b/optimum/runs_base.py @@ -2,13 +2,12 @@ import subprocess from contextlib import contextmanager from time import perf_counter_ns -from typing import Set +from typing import TYPE_CHECKING, Set import numpy as np import optuna import torch import transformers -from datasets import Dataset from tqdm import trange from . 
import version as optimum_version @@ -21,6 +20,9 @@ from .utils.runs import RunConfig, cpu_info_command +if TYPE_CHECKING: + from datasets import Dataset + os.environ["TOKENIZERS_PARALLELISM"] = "false" @@ -34,7 +36,7 @@ def get_autoclass_name(task): class Calibrator: def __init__( - self, calibration_dataset: Dataset, quantizer, model_path, qconfig, calibration_params, node_exclusion + self, calibration_dataset: "Dataset", quantizer, model_path, qconfig, calibration_params, node_exclusion ): self.calibration_dataset = calibration_dataset self.quantizer = quantizer diff --git a/optimum/utils/import_utils.py b/optimum/utils/import_utils.py index 35a6294ab52..c3b9f4f57bf 100644 --- a/optimum/utils/import_utils.py +++ b/optimum/utils/import_utils.py @@ -69,6 +69,7 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[ _auto_gptq_available = _is_package_available("auto_gptq") _timm_available = _is_package_available("timm") _sentence_transformers_available = _is_package_available("sentence_transformers") +_datasets_available = _is_package_available("datasets") torch_version = None if is_torch_available(): @@ -131,6 +132,10 @@ def is_sentence_transformers_available(): return _sentence_transformers_available +def is_datasets_available(): + return _datasets_available + + def is_auto_gptq_available(): if _auto_gptq_available: version_autogptq = version.parse(importlib_metadata.version("auto_gptq")) From 81ad6bad22b55eb772aea0747b610ff93d69b63d Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 5 Nov 2024 10:19:57 +0100 Subject: [PATCH 05/12] add datasets installed when needed --- optimum/gptq/data.py | 17 ++++++++++++++++- optimum/gptq/quantizer.py | 2 +- optimum/onnxruntime/quantization.py | 8 ++------ optimum/utils/import_utils.py | 7 +++++++ 4 files changed, 26 insertions(+), 8 deletions(-) diff --git a/optimum/gptq/data.py b/optimum/gptq/data.py index b8734da478e..7b472e7efdf 100644 --- a/optimum/gptq/data.py +++ b/optimum/gptq/data.py @@ -18,7 +18,11 @@ import numpy as np import torch -from datasets import load_dataset +from optimum.utils.import_utils import is_datasets_available, DATASETS_IMPORT_ERROR + + +if is_datasets_available(): + from datasets import load_dataset """ @@ -113,6 +117,10 @@ def pad_block(block, pads): def get_wikitext2(tokenizer: Any, seqlen: int, nsamples: int, split: str = "train"): + + if not is_datasets_available(): + raise ImportError(DATASETS_IMPORT_ERROR.format("get_wikitext2")) + if split == "train": data = load_dataset("wikitext", "wikitext-2-raw-v1", split="train") elif split == "validation": @@ -132,6 +140,9 @@ def get_wikitext2(tokenizer: Any, seqlen: int, nsamples: int, split: str = "trai def get_c4(tokenizer: Any, seqlen: int, nsamples: int, split: str = "train"): + if not is_datasets_available(): + raise ImportError(DATASETS_IMPORT_ERROR.format("get_c4")) + if split == "train": data = load_dataset("allenai/c4", split="train", data_files={"train": "en/c4-train.00000-of-01024.json.gz"}) elif split == "validation": @@ -157,6 +168,10 @@ def get_c4(tokenizer: Any, seqlen: int, nsamples: int, split: str = "train"): def get_c4_new(tokenizer: Any, seqlen: int, nsamples: int, split: str = "train"): + + if not is_datasets_available(): + raise ImportError(DATASETS_IMPORT_ERROR.format("get_c4_new")) + if split == "train": data = load_dataset("allenai/c4", split="train", data_files={"train": "en/c4-train.00000-of-01024.json.gz"}) elif split == "validation": diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py index 
949d4d260df..849d8821ebf 100644 --- a/optimum/gptq/quantizer.py +++ b/optimum/gptq/quantizer.py @@ -88,7 +88,7 @@ def __init__( dataset (`Union[List[str], str, Any]`, defaults to `None`): The dataset used for quantization. You can provide your own dataset in a list of string or in a list of tokenized data (e.g. [{ "input_ids": [ 1, 100, 15, ... ],"attention_mask": [ 1, 1, 1, ... ]},...]) - or just use the original datasets used in GPTQ paper ['wikitext2','c4','c4-new','ptb','ptb-new']. + or just use the original datasets used in GPTQ paper ['wikitext2','c4','c4-new']. group_size (int, defaults to 128): The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization. damp_percent (`float`, defaults to `0.1`): diff --git a/optimum/onnxruntime/quantization.py b/optimum/onnxruntime/quantization.py index 5c53154b62e..dca35849288 100644 --- a/optimum/onnxruntime/quantization.py +++ b/optimum/onnxruntime/quantization.py @@ -28,7 +28,7 @@ from onnxruntime.quantization import CalibrationDataReader, QuantFormat, QuantizationMode, QuantType from onnxruntime.quantization.onnx_quantizer import ONNXQuantizer from onnxruntime.quantization.qdq_quantizer import QDQQuantizer -from optimum.utils.import_utils import is_datasets_available +from optimum.utils.import_utils import requires_backends from ..quantization_base import OptimumQuantizer from ..utils.save_utils import maybe_save_preprocessors @@ -474,11 +474,7 @@ def get_calibration_dataset( "provided." ) - if not is_datasets_available(): - raise ImportError( - "`get_calibration_dataset` requires the datasets library but it was not found in your environment. " - "You can install it with pip: `pip install datasets`." - ) + requires_backends(self, ["datasets"]) from datasets import load_dataset diff --git a/optimum/utils/import_utils.py b/optimum/utils/import_utils.py index c3b9f4f57bf..405e3815b33 100644 --- a/optimum/utils/import_utils.py +++ b/optimum/utils/import_utils.py @@ -235,6 +235,12 @@ def require_numpy_strictly_lower(package_version: str, message: str): -U transformers`. Please note that you may need to restart your runtime after installation. """ +DATASETS_IMPORT_ERROR = """ +{0} requires the datasets library but it was not found in your environment. You can install it with pip: +`pip install datasets`. Please note that you may need to restart your runtime after installation. 
+""" + + BACKENDS_MAPPING = OrderedDict( [ ("diffusers", (is_diffusers_available, DIFFUSERS_IMPORT_ERROR)), @@ -250,6 +256,7 @@ def require_numpy_strictly_lower(package_version: str, message: str): "transformers_434", (lambda: check_if_transformers_greater("4.34"), "{0} " + TRANSFORMERS_IMPORT_ERROR.format("4.34")), ), + ("datasets", (is_datasets_available, DATASETS_IMPORT_ERROR)), ] ) From 40967f29e08bd5d7132418a2f672936ab2bcdeb6 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 5 Nov 2024 10:20:14 +0100 Subject: [PATCH 06/12] style --- optimum/gptq/data.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/optimum/gptq/data.py b/optimum/gptq/data.py index 7b472e7efdf..7e5fc0b43db 100644 --- a/optimum/gptq/data.py +++ b/optimum/gptq/data.py @@ -18,7 +18,8 @@ import numpy as np import torch -from optimum.utils.import_utils import is_datasets_available, DATASETS_IMPORT_ERROR + +from optimum.utils.import_utils import DATASETS_IMPORT_ERROR, is_datasets_available if is_datasets_available(): @@ -117,7 +118,6 @@ def pad_block(block, pads): def get_wikitext2(tokenizer: Any, seqlen: int, nsamples: int, split: str = "train"): - if not is_datasets_available(): raise ImportError(DATASETS_IMPORT_ERROR.format("get_wikitext2")) @@ -168,7 +168,6 @@ def get_c4(tokenizer: Any, seqlen: int, nsamples: int, split: str = "train"): def get_c4_new(tokenizer: Any, seqlen: int, nsamples: int, split: str = "train"): - if not is_datasets_available(): raise ImportError(DATASETS_IMPORT_ERROR.format("get_c4_new")) From da1e9f586f50089d0428c39c661355620c445845 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 5 Nov 2024 10:38:06 +0100 Subject: [PATCH 07/12] fix --- optimum/utils/preprocessing/base.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/optimum/utils/preprocessing/base.py b/optimum/utils/preprocessing/base.py index dc995ccc50b..19b4d9614c0 100644 --- a/optimum/utils/preprocessing/base.py +++ b/optimum/utils/preprocessing/base.py @@ -20,15 +20,16 @@ from abc import ABC, abstractmethod from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union -from datasets import Dataset, DatasetDict -from datasets import load_dataset as datasets_load_dataset from transformers import PreTrainedTokenizerBase from transformers.image_processing_utils import BaseImageProcessor +from optimum.utils.import_utils import requires_backends + from .. 
import logging if TYPE_CHECKING: + from datasets import Dataset, DatasetDict from transformers import PretrainedConfig @@ -102,11 +103,14 @@ def create_dataset_processing_func( def prepare_dataset( self, - dataset: Union[DatasetDict, Dataset], + dataset: Union["DatasetDict", "Dataset"], data_keys: Dict[str, str], ref_keys: Optional[List[str]] = None, split: Optional[str] = None, - ) -> Union[DatasetDict, Dataset]: + ) -> Union["DatasetDict", "Dataset"]: + requires_backends(self, ["datasets"]) + from datasets import Dataset + if isinstance(dataset, Dataset) and split is not None: raise ValueError("A Dataset and a split name were provided, but splits are for DatasetDict.") elif split is not None: @@ -131,7 +135,12 @@ def load_dataset( num_samples: Optional[int] = None, shuffle: bool = False, **load_dataset_kwargs, - ) -> Union[DatasetDict, Dataset]: + ) -> Union["DatasetDict", "Dataset"]: + requires_backends(self, ["datasets"]) + + from datasets import DatasetDict + from datasets import load_dataset as datasets_load_dataset + dataset = datasets_load_dataset(path, **load_dataset_kwargs) if isinstance(dataset, DatasetDict) and load_smallest_split: From bf70b764ed0bee8e39fa01bf77b9364e9765683a Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 5 Nov 2024 17:13:27 +0100 Subject: [PATCH 08/12] fix import --- optimum/utils/preprocessing/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/utils/preprocessing/base.py b/optimum/utils/preprocessing/base.py index 19b4d9614c0..7cfda13ba7d 100644 --- a/optimum/utils/preprocessing/base.py +++ b/optimum/utils/preprocessing/base.py @@ -138,7 +138,7 @@ def load_dataset( ) -> Union["DatasetDict", "Dataset"]: requires_backends(self, ["datasets"]) - from datasets import DatasetDict + from datasets import Dataset, DatasetDict from datasets import load_dataset as datasets_load_dataset dataset = datasets_load_dataset(path, **load_dataset_kwargs) From 2facb94f0ad5e4246eeb17c59db645e3a585056e Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 5 Nov 2024 17:13:40 +0100 Subject: [PATCH 09/12] add require dataset --- optimum/utils/testing_utils.py | 5 +++++ tests/utils/test_task_processors.py | 10 ++++++++++ 2 files changed, 15 insertions(+) diff --git a/optimum/utils/testing_utils.py b/optimum/utils/testing_utils.py index 76fe9a05b13..88b1acdb780 100644 --- a/optimum/utils/testing_utils.py +++ b/optimum/utils/testing_utils.py @@ -28,6 +28,7 @@ from . 
import ( is_accelerate_available, is_auto_gptq_available, + is_datasets_available, is_diffusers_available, is_sentence_transformers_available, is_timm_available, @@ -146,6 +147,10 @@ def require_sentence_transformers(test_case): return unittest.skipUnless(is_sentence_transformers_available(), "test requires sentence-transformers")(test_case) +def require_datasets(test_case): + return unittest.skipUnless(is_datasets_available(), "test requires datasets")(test_case) + + def grid_parameters( parameters: Dict[str, Iterable[Any]], yield_dict: bool = False, diff --git a/tests/utils/test_task_processors.py b/tests/utils/test_task_processors.py index 16567048073..8337d3e477a 100644 --- a/tests/utils/test_task_processors.py +++ b/tests/utils/test_task_processors.py @@ -23,6 +23,7 @@ from transformers import AutoConfig, AutoFeatureExtractor, AutoTokenizer from optimum.utils.preprocessing import TaskProcessorsManager +from optimum.utils.testing_utils import require_datasets if TYPE_CHECKING: @@ -122,6 +123,7 @@ def test_create_defaults_and_kwargs_from_preprocessor_kwargs_does_not_mutate_pre ) self.assertDictEqual(preprocessor_kwargs, clone) + @require_datasets def test_load_dataset_unallowed_data_keys(self): task_processor = TaskProcessorsManager.get_task_processor_class_for_task(self.TASK_NAME)( self.CONFIG, self.PREPROCESSOR @@ -188,15 +190,19 @@ def _test_load_dataset( return dataset + @require_datasets def test_load_dataset(self): return self._test_load_dataset(False, False, False) + @require_datasets def test_load_dataset_by_guessing_data_keys(self): return self._test_load_dataset(False, True, False) + @require_datasets def test_load_dataset_and_only_keep_necessary_columns(self): return self._test_load_dataset(False, False, True) + @require_datasets def test_load_default_dataset(self): return self._test_load_dataset(True, False, False) @@ -207,6 +213,7 @@ class TextClassificationProcessorTest(TestCase, TaskProcessorTestBase): PREPROCESSOR = TOKENIZER WRONG_PREPROCESSOR = IMAGE_PROCESSOR + @require_datasets def test_load_dataset_with_max_length(self): max_length = random.randint(4, 16) dataset = self._test_load_dataset(False, False, True, max_length=max_length) @@ -223,6 +230,7 @@ class TokenClassificationProcessorTest(TestCase, TaskProcessorTestBase): PREPROCESSOR = TOKENIZER WRONG_PREPROCESSOR = IMAGE_PROCESSOR + @require_datasets def test_load_dataset_with_max_length(self): max_length = random.randint(4, 16) dataset = self._test_load_dataset(False, False, True, max_length=max_length) @@ -232,6 +240,7 @@ def test_load_dataset_with_max_length(self): input_ids = dataset[0]["input_ids"] self.assertEqual(len(input_ids), max_length) + @require_datasets def test_load_default_dataset(self): self.skipTest( "Skipping so as not to execute conll2003 remote code (test would require trust_remote_code=True)" @@ -244,6 +253,7 @@ class QuestionAnsweringProcessorTest(TestCase, TaskProcessorTestBase): PREPROCESSOR = TOKENIZER WRONG_PREPROCESSOR = IMAGE_PROCESSOR + @require_datasets def test_load_dataset_with_max_length(self): max_length = 384 dataset = self._test_load_dataset(False, False, True, max_length=max_length) From fb01ab2188788f9e0c0abfb61bfd6b83f198f774 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 5 Nov 2024 17:19:51 +0100 Subject: [PATCH 10/12] divide datasets tests --- .github/workflows/test_utils.yml | 13 +++++++++++-- pyproject.toml | 1 + tests/utils/test_task_processors.py | 10 ++++++++++ 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_utils.yml 
b/.github/workflows/test_utils.yml index 4e4d2428967..4e849ca3178 100644 --- a/.github/workflows/test_utils.yml +++ b/.github/workflows/test_utils.yml @@ -32,9 +32,18 @@ jobs: run: | python -m pip install --upgrade pip pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu - pip install .[tests] datasets + pip install .[tests] - name: Test with pytest working-directory: tests run: | - python -m pytest -s -vvvv utils + pytest utils -s -n auto -m "not datasets_test" --durations=0 + + - name: Install datasets + run: | + pip install datasets + + - name: Tests needing datasets + working-directory: tests + run: | + pytest utils -s -n auto -m "datasets_test" --durations=0 \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 99a0f1c85fa..17bcd90e066 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,7 @@ markers = [ "rocm_ep_test", "tensorflow_test", "timm_test", + "datasets_test", "run_in_series", "run_slow", "accelerate_test", diff --git a/tests/utils/test_task_processors.py b/tests/utils/test_task_processors.py index 8337d3e477a..5ffeab07b27 100644 --- a/tests/utils/test_task_processors.py +++ b/tests/utils/test_task_processors.py @@ -19,6 +19,7 @@ from typing import TYPE_CHECKING, Any, Dict, Tuple, Union from unittest import TestCase +import pytest from datasets import DatasetDict from transformers import AutoConfig, AutoFeatureExtractor, AutoTokenizer @@ -124,6 +125,7 @@ def test_create_defaults_and_kwargs_from_preprocessor_kwargs_does_not_mutate_pre self.assertDictEqual(preprocessor_kwargs, clone) @require_datasets + @pytest.mark.datasets_test def test_load_dataset_unallowed_data_keys(self): task_processor = TaskProcessorsManager.get_task_processor_class_for_task(self.TASK_NAME)( self.CONFIG, self.PREPROCESSOR @@ -191,18 +193,22 @@ def _test_load_dataset( return dataset @require_datasets + @pytest.mark.datasets_test def test_load_dataset(self): return self._test_load_dataset(False, False, False) @require_datasets + @pytest.mark.datasets_test def test_load_dataset_by_guessing_data_keys(self): return self._test_load_dataset(False, True, False) @require_datasets + @pytest.mark.datasets_test def test_load_dataset_and_only_keep_necessary_columns(self): return self._test_load_dataset(False, False, True) @require_datasets + @pytest.mark.datasets_test def test_load_default_dataset(self): return self._test_load_dataset(True, False, False) @@ -214,6 +220,7 @@ class TextClassificationProcessorTest(TestCase, TaskProcessorTestBase): WRONG_PREPROCESSOR = IMAGE_PROCESSOR @require_datasets + @pytest.mark.datasets_test def test_load_dataset_with_max_length(self): max_length = random.randint(4, 16) dataset = self._test_load_dataset(False, False, True, max_length=max_length) @@ -231,6 +238,7 @@ class TokenClassificationProcessorTest(TestCase, TaskProcessorTestBase): WRONG_PREPROCESSOR = IMAGE_PROCESSOR @require_datasets + @pytest.mark.datasets_test def test_load_dataset_with_max_length(self): max_length = random.randint(4, 16) dataset = self._test_load_dataset(False, False, True, max_length=max_length) @@ -241,6 +249,7 @@ def test_load_dataset_with_max_length(self): self.assertEqual(len(input_ids), max_length) @require_datasets + @pytest.mark.datasets_test def test_load_default_dataset(self): self.skipTest( "Skipping so as not to execute conll2003 remote code (test would require trust_remote_code=True)" @@ -254,6 +263,7 @@ class QuestionAnsweringProcessorTest(TestCase, TaskProcessorTestBase): WRONG_PREPROCESSOR = 
IMAGE_PROCESSOR @require_datasets + @pytest.mark.datasets_test def test_load_dataset_with_max_length(self): max_length = 384 dataset = self._test_load_dataset(False, False, True, max_length=max_length) From d9e9e4af9e6f98d1a16b308acbb7a812f9737e8a Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Fri, 15 Nov 2024 15:24:11 +0100 Subject: [PATCH 11/12] fix --- optimum/utils/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/optimum/utils/__init__.py b/optimum/utils/__init__.py index db7d1f6975d..9afcdcb9117 100644 --- a/optimum/utils/__init__.py +++ b/optimum/utils/__init__.py @@ -33,6 +33,7 @@ check_if_transformers_greater, is_accelerate_available, is_auto_gptq_available, + is_datasets_available, is_diffusers_available, is_onnx_available, is_onnxruntime_available, From ee90d064253e40e309d8e8dfa0a341c70e44419f Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Fri, 15 Nov 2024 15:39:49 +0100 Subject: [PATCH 12/12] import datasets only when needed --- tests/utils/test_task_processors.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/utils/test_task_processors.py b/tests/utils/test_task_processors.py index 5ffeab07b27..1a9f352a79f 100644 --- a/tests/utils/test_task_processors.py +++ b/tests/utils/test_task_processors.py @@ -20,9 +20,9 @@ from unittest import TestCase import pytest -from datasets import DatasetDict from transformers import AutoConfig, AutoFeatureExtractor, AutoTokenizer +from optimum.utils.import_utils import is_datasets_available from optimum.utils.preprocessing import TaskProcessorsManager from optimum.utils.testing_utils import require_datasets @@ -31,6 +31,9 @@ from transformers import PretrainedConfig, PreTrainedTokenizerBase from transformers.image_processing_utils import BaseImageProcessor +if is_datasets_available(): + from datasets import DatasetDict + TEXT_MODEL_NAME = "bert-base-uncased" CONFIG = AutoConfig.from_pretrained(TEXT_MODEL_NAME)
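
Note on the pattern these patches converge on: `datasets` becomes an optional dependency by importing it only under `typing.TYPE_CHECKING` for annotations, checking availability before use (`is_datasets_available` / `requires_backends`), and deferring the real import to inside the function that needs it. The sketch below is a minimal, self-contained illustration of that pattern, not the exact code from the diffs above; `load_calibration_split` and the simplified availability check are hypothetical stand-ins.

    # Minimal sketch of the optional-dependency pattern applied to `datasets`.
    # Assumptions: `load_calibration_split` is an illustrative helper, and the
    # availability check is simplified compared to optimum.utils.import_utils.
    import importlib.util
    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Only evaluated by type checkers; no runtime dependency on `datasets`.
        from datasets import Dataset


    def is_datasets_available() -> bool:
        # True if the `datasets` package can be found without importing it.
        return importlib.util.find_spec("datasets") is not None


    def load_calibration_split(name: str) -> "Dataset":
        if not is_datasets_available():
            raise ImportError(
                "load_calibration_split requires the datasets library. "
                "You can install it with pip: `pip install datasets`."
            )
        # Deferred import so the package stays optional for users who never call this.
        from datasets import load_dataset

        return load_dataset(name, split="train")

Using string annotations such as `"Dataset"` keeps signatures informative for type checkers and documentation while avoiding both the import cost and a hard install-time requirement, which is why the patches replace the module-level `from datasets import Dataset` imports rather than simply wrapping them in try/except.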