From 5261b8fcaeb01df52ed3c31fafd295db1b65fd62 Mon Sep 17 00:00:00 2001 From: Peter Shevcnenko <57573631+MorrisNein@users.noreply.github.com> Date: Fri, 30 Jun 2023 18:35:35 +0300 Subject: [PATCH] Refactor data storage (#15) * refactor dataset classes, use openml cache * fix example select_similar_datasets_by_knn.py * create DatasetIDType * create PredictorType * remove DataManager, refactor cache * update tests & test data * allow explicit OpenMLDataset creation from name/search * adapt examples to the last changes --- .gitignore | 2 +- .../0_loading_data/load_list_of_datasests.py | 5 +- .../extract_with_load_on_demand.py | 5 +- .../load_and_extract_features_sequentially.py | 4 +- .../select_similar_datasets_by_knn.py | 5 +- .../advise_models_from_similar_datasets.py | 11 +- examples/knowledge_base_loading.py | 6 +- meta_automl/data_preparation/data_manager.py | 59 -- meta_automl/data_preparation/dataset.py | 64 -- .../data_preparation/dataset/__init__.py | 3 + .../dataset/custom_dataset.py | 30 + .../data_preparation/dataset/dataset_base.py | 40 + .../dataset/openml_dataset.py | 39 + .../datasets_loaders/__init__.py | 2 +- .../datasets_loaders/datasets_loader.py | 16 +- .../openml_datasets_loader.py | 66 +- .../data_preparation/file_system/__init__.py | 5 + .../data_preparation/file_system/cache.py | 95 +++ .../file_system/cache_properties.py | 21 + .../file_system/file_system.py | 27 + .../meta_features_extractor.py | 22 +- .../pymfe_extractor.py | 31 +- meta_automl/data_preparation/model.py | 9 +- .../models_loaders/fedot_pipelines_loader.py | 48 +- .../knowledge_base_models_loader.py | 21 +- .../model_based_similarity_assessors.py | 9 +- .../model_advisors/model_advisor.py | 7 +- requirements.txt | Bin 430 -> 460 bytes test/conftest.py | 40 + test/constants.py | 7 +- test/data/datasets/australian.pkl | Bin 41870 -> 0 bytes test/data/datasets/monks-problems-1.pkl | Bin 16009 -> 0 bytes .../pymfe/334.pkl} | Bin .../pymfe/40981.pkl} | Bin .../org/openml/www/datasets/333/dataset.arff | 651 ++++++++++++++++ .../www/datasets/333/dataset_333.pkl.py3 | Bin 0 -> 5724 bytes .../openml/www/datasets/333/dataset_333.pq | Bin 0 -> 6016 bytes .../openml/www/datasets/333/description.xml | 33 + .../org/openml/www/datasets/333/features.xml | 84 +++ .../openml/www/datasets/333/features.xml.pkl | Bin 0 -> 509 bytes .../openml/www/datasets/40981/dataset.arff | 707 ++++++++++++++++++ .../www/datasets/40981/dataset_40981.pkl.py3 | Bin 0 -> 17678 bytes .../www/datasets/40981/dataset_40981.pq | Bin 0 -> 20170 bytes .../openml/www/datasets/40981/description.xml | 49 ++ .../openml/www/datasets/40981/features.xml | 175 +++++ .../www/datasets/40981/features.xml.pkl | Bin 0 -> 899 bytes test/data_manager.py | 9 - test/general_checks.py | 25 - test/unit/datasets/__init__.py | 0 test/unit/datasets/conftest.py | 18 + test/unit/datasets/general_checks.py | 24 + test/unit/datasets/test_custom_dataset.py | 48 ++ test/unit/datasets/test_datasets_loaders.py | 24 + test/unit/datasets/test_file_dataset.py | 48 ++ test/unit/datasets/test_openml_dataset.py | 27 + test/unit/test_dataset.py | 40 - test/unit/test_datasets_loaders.py | 50 -- test/unit/test_file_system.py | 7 + test/unit/test_meta_features_extractors.py | 47 +- 59 files changed, 2350 insertions(+), 415 deletions(-) delete mode 100644 meta_automl/data_preparation/data_manager.py delete mode 100644 meta_automl/data_preparation/dataset.py create mode 100644 meta_automl/data_preparation/dataset/__init__.py create mode 100644 meta_automl/data_preparation/dataset/custom_dataset.py create mode 100644 meta_automl/data_preparation/dataset/dataset_base.py create mode 100644 meta_automl/data_preparation/dataset/openml_dataset.py create mode 100644 meta_automl/data_preparation/file_system/__init__.py create mode 100644 meta_automl/data_preparation/file_system/cache.py create mode 100644 meta_automl/data_preparation/file_system/cache_properties.py create mode 100644 meta_automl/data_preparation/file_system/file_system.py create mode 100644 test/conftest.py delete mode 100644 test/data/datasets/australian.pkl delete mode 100644 test/data/datasets/monks-problems-1.pkl rename test/data/{pymfe/monks-problems-2.pkl => metafeatures/pymfe/334.pkl} (100%) rename test/data/{pymfe/australian.pkl => metafeatures/pymfe/40981.pkl} (100%) create mode 100644 test/data/openml_cache/org/openml/www/datasets/333/dataset.arff create mode 100644 test/data/openml_cache/org/openml/www/datasets/333/dataset_333.pkl.py3 create mode 100644 test/data/openml_cache/org/openml/www/datasets/333/dataset_333.pq create mode 100644 test/data/openml_cache/org/openml/www/datasets/333/description.xml create mode 100644 test/data/openml_cache/org/openml/www/datasets/333/features.xml create mode 100644 test/data/openml_cache/org/openml/www/datasets/333/features.xml.pkl create mode 100644 test/data/openml_cache/org/openml/www/datasets/40981/dataset.arff create mode 100644 test/data/openml_cache/org/openml/www/datasets/40981/dataset_40981.pkl.py3 create mode 100644 test/data/openml_cache/org/openml/www/datasets/40981/dataset_40981.pq create mode 100644 test/data/openml_cache/org/openml/www/datasets/40981/description.xml create mode 100644 test/data/openml_cache/org/openml/www/datasets/40981/features.xml create mode 100644 test/data/openml_cache/org/openml/www/datasets/40981/features.xml.pkl delete mode 100644 test/data_manager.py delete mode 100644 test/general_checks.py create mode 100644 test/unit/datasets/__init__.py create mode 100644 test/unit/datasets/conftest.py create mode 100644 test/unit/datasets/general_checks.py create mode 100644 test/unit/datasets/test_custom_dataset.py create mode 100644 test/unit/datasets/test_datasets_loaders.py create mode 100644 test/unit/datasets/test_file_dataset.py create mode 100644 test/unit/datasets/test_openml_dataset.py delete mode 100644 test/unit/test_dataset.py delete mode 100644 test/unit/test_datasets_loaders.py create mode 100644 test/unit/test_file_system.py diff --git a/.gitignore b/.gitignore index 9e584fd4..a5f9134a 100644 --- a/.gitignore +++ b/.gitignore @@ -129,4 +129,4 @@ dmypy.json .pyre/ # User data -data/ +/data diff --git a/examples/0_loading_data/load_list_of_datasests.py b/examples/0_loading_data/load_list_of_datasests.py index c2ee1cbb..741438e1 100644 --- a/examples/0_loading_data/load_list_of_datasests.py +++ b/examples/0_loading_data/load_list_of_datasests.py @@ -6,9 +6,8 @@ def get_datasets(): 'nomao', 'sylvine', 'kc1', 'jungle_chess_2pcs_raw_endgame_complete', 'credit-g', 'delta_ailerons', 'pol' ] datasets_loader = OpenMLDatasetsLoader() - datasets = datasets_loader.load(dataset_names) - print(f'Datasets "{", ".join(dataset_names)}" are available at the paths:') - print('\n'.join(str(d) for d in datasets)) + datasets = datasets_loader.load(dataset_names, allow_names=True) + print(f'Datasets "{", ".join(dataset_names)}" are downloaded.') return datasets diff --git a/examples/2_extracting_datasets_meta_features/extract_with_load_on_demand.py b/examples/2_extracting_datasets_meta_features/extract_with_load_on_demand.py index 9519e6ca..ad2110a2 100644 --- a/examples/2_extracting_datasets_meta_features/extract_with_load_on_demand.py +++ b/examples/2_extracting_datasets_meta_features/extract_with_load_on_demand.py @@ -1,3 +1,5 @@ +import openml + from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor @@ -6,8 +8,9 @@ def main(): dataset_names = [ 'nomao', 'sylvine' ] + dataset_ids = [openml.datasets.get_dataset(name, download_data=False, download_qualities=False).dataset_id for name in dataset_names] extractor = PymfeExtractor(extractor_params={'groups': 'general'}, datasets_loader=OpenMLDatasetsLoader()) - meta_features = extractor.extract(dataset_names) + meta_features = extractor.extract(dataset_ids) return meta_features diff --git a/examples/2_extracting_datasets_meta_features/load_and_extract_features_sequentially.py b/examples/2_extracting_datasets_meta_features/load_and_extract_features_sequentially.py index f1d21cf4..cda8b804 100644 --- a/examples/2_extracting_datasets_meta_features/load_and_extract_features_sequentially.py +++ b/examples/2_extracting_datasets_meta_features/load_and_extract_features_sequentially.py @@ -9,8 +9,8 @@ def main(): loader = OpenMLDatasetsLoader() extractor = PymfeExtractor(extractor_params={'groups': 'general'}) - cached_datasets = loader.load(dataset_names) - meta_features = extractor.extract(cached_datasets) + datasets = loader.load(dataset_names, allow_names=True) + meta_features = extractor.extract(datasets) return meta_features diff --git a/examples/3_selecting_similar_datasets/select_similar_datasets_by_knn.py b/examples/3_selecting_similar_datasets/select_similar_datasets_by_knn.py index b6f2bb8c..5f13201e 100644 --- a/examples/3_selecting_similar_datasets/select_similar_datasets_by_knn.py +++ b/examples/3_selecting_similar_datasets/select_similar_datasets_by_knn.py @@ -8,9 +8,10 @@ def main(): # Define datasets. dataset_names = ['monks-problems-1', 'apsfailure', 'australian', 'bank-marketing'] + datasets = OpenMLDatasetsLoader().load(dataset_names, allow_names=True) # Extract meta-features and load on demand. - extractor = PymfeExtractor(extractor_params={'groups': 'general'}, datasets_loader=OpenMLDatasetsLoader()) - meta_features = extractor.extract(dataset_names) + extractor = PymfeExtractor(extractor_params={'groups': 'general'}) + meta_features = extractor.extract(datasets) # Preprocess meta-features, as KNN does not support NaNs. meta_features = meta_features.dropna(axis=1, how='any') # Split datasets to train (preprocessing) and test (actual meta-algorithm objects). diff --git a/examples/4_advising_models/advise_models_from_similar_datasets.py b/examples/4_advising_models/advise_models_from_similar_datasets.py index 37c3b2db..e1dc16aa 100644 --- a/examples/4_advising_models/advise_models_from_similar_datasets.py +++ b/examples/4_advising_models/advise_models_from_similar_datasets.py @@ -2,7 +2,7 @@ from golem.core.optimisers.fitness import SingleObjFitness from sklearn.model_selection import train_test_split -from meta_automl.data_preparation.dataset import DatasetCache +from meta_automl.data_preparation.dataset import OpenMLDataset from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor from meta_automl.data_preparation.model import Model @@ -13,9 +13,10 @@ def main(): # Define datasets. dataset_names = ['monks-problems-1', 'apsfailure', 'australian', 'bank-marketing'] + datasets = OpenMLDatasetsLoader().load(dataset_names, allow_names=True) # Extract meta-features and load on demand. - extractor = PymfeExtractor(extractor_params={'groups': 'general'}, datasets_loader=OpenMLDatasetsLoader()) - meta_features = extractor.extract(dataset_names) + extractor = PymfeExtractor(extractor_params={'groups': 'general'}) + meta_features = extractor.extract(datasets) # Preprocess meta-features, as KNN does not support NaNs. meta_features = meta_features.dropna(axis=1, how='any') # Split datasets to train (preprocessing) and test (actual meta-algorithm objects). @@ -29,8 +30,8 @@ def main(): PipelineBuilder().add_node('normalization').add_node('logit').build(), PipelineBuilder().add_node('rf').add_node('logit').build() ] - best_models = [[Model(pipeline, SingleObjFitness(1), 'some_metric_name', DatasetCache(dataset_name))] - for dataset_name, pipeline in zip(y_train, best_pipelines)] + best_models = [[Model(pipeline, SingleObjFitness(1), 'some_metric_name', OpenMLDataset(dataset_id))] + for dataset_id, pipeline in zip(y_train, best_pipelines)] dataset_names_to_best_pipelines = dict(zip(y_train, best_models)) advisor = DiverseFEDOTPipelineAdvisor(assessor, minimal_distance=2).fit(dataset_names_to_best_pipelines) diff --git a/examples/knowledge_base_loading.py b/examples/knowledge_base_loading.py index 699a547f..310b4bdf 100644 --- a/examples/knowledge_base_loading.py +++ b/examples/knowledge_base_loading.py @@ -16,12 +16,12 @@ # ===== Another way to get train models, but also group them by datasets: models_for_train = {} - for dataset_name in train_datasets['dataset_name']: + for dataset_id in train_datasets['dataset_id']: dataset_models = models_loader.load( - dataset_names=[dataset_name], # load models just for this exact dataset. + dataset_ids=[dataset_id], # load models just for this exact dataset. fitness_metric='logloss', # must correspond to a metric name in a knowledge base. ) - models_for_train[dataset_name] = dataset_models + models_for_train[dataset_id] = dataset_models # If you need to load data to the local storage # dataset = OpenMLDatasetsLoader().load_single(dataset_name) diff --git a/meta_automl/data_preparation/data_manager.py b/meta_automl/data_preparation/data_manager.py deleted file mode 100644 index 0a743e28..00000000 --- a/meta_automl/data_preparation/data_manager.py +++ /dev/null @@ -1,59 +0,0 @@ -from __future__ import annotations - -import pickle -from os import PathLike -from pathlib import Path -from typing import Dict, Any, Union - -PathType = Union[PathLike, str] -DEFAULT_CACHE_EXTENSION = '.pkl' - - -class DataManager: - - @classmethod - def get_dataset_cache_path(cls, dataset_name: str) -> Path: - return cls.get_datasets_dir().joinpath(dataset_name).with_suffix(DEFAULT_CACHE_EXTENSION) - - @classmethod - def get_datasets_dir(cls) -> Path: - datasets_dir = cls.get_data_dir().joinpath('datasets') - return cls.ensure_dir_exists(datasets_dir) - - @classmethod - def get_data_dir(cls) -> Path: - data_dir = cls.get_project_root().joinpath('data') - return cls.ensure_dir_exists(data_dir) - - @classmethod - def ensure_dir_exists(cls, dir_: Path) -> Path: - if not dir_.exists(): - dir_.mkdir() - return dir_ - - @classmethod - def get_project_root(cls) -> Path: - """Returns project root folder.""" - return Path(__file__).parents[2] - - @classmethod - def get_meta_features_cache_path(cls, dataset_name: str, source_name: str): - meta_features_dir = cls.ensure_dir_exists(cls.get_data_dir().joinpath(source_name)) - return meta_features_dir.joinpath(dataset_name).with_suffix('.pkl') - - @classmethod - def get_meta_features_dict(cls, dataset_name: str, source_name: str) -> Dict[str, Any]: - meta_features_file = cls.get_meta_features_cache_path(dataset_name, source_name) - if not meta_features_file.exists(): - return {} - with open(meta_features_file, 'rb') as f: - meta_features = pickle.load(f) - return meta_features - - @classmethod - def update_meta_features_dict(cls, dataset_name: str, source_name: str, meta_features: Dict[str, Any]): - meta_features_file = cls.get_meta_features_cache_path(dataset_name, source_name) - meta_features_old = cls.get_meta_features_dict(dataset_name, source_name) - with open(meta_features_file, 'wb') as f: - meta_features_old.update(meta_features) - pickle.dump(meta_features, f) diff --git a/meta_automl/data_preparation/dataset.py b/meta_automl/data_preparation/dataset.py deleted file mode 100644 index 23dda83c..00000000 --- a/meta_automl/data_preparation/dataset.py +++ /dev/null @@ -1,64 +0,0 @@ -from __future__ import annotations - -import pickle -from dataclasses import dataclass -from pathlib import Path -from typing import Union, Optional, List - -import numpy as np -import pandas as pd -import scipy as sp - -from meta_automl.data_preparation.data_manager import DataManager - - -class NoCacheError(FileNotFoundError): - pass - - -@dataclass -class DatasetCache: - name: str - _cache_path: Optional[Path] = None - _id: Optional[int] = None - - @property - def id(self): - return self._id or self.name - - @property - def cache_path(self): - return self._cache_path or DataManager.get_dataset_cache_path(self.name) - - @cache_path.setter - def cache_path(self, val): - self._cache_path = val - - def from_cache(self) -> Dataset: - if not self.cache_path.exists(): - raise NoCacheError(f'Dataset {self.name} not found!') - with open(self.cache_path, 'rb') as f: - dataset = pickle.load(f) - dataset.cache_path = self.cache_path - return dataset - - -@dataclass -class Dataset: - name: str - x: Union[np.ndarray, pd.DataFrame, sp.sparse.csr_matrix] - y: Optional[Union[np.ndarray, pd.DataFrame]] = None - categorical_indicator: Optional[List[bool]] = None - attribute_names: Optional[List[str]] = None - cache_path: Optional[Path] = None - _id: Optional[int] = None - - def dump_to_cache(self, cache_path: Optional[Path] = None) -> DatasetCache: - cache_path = cache_path or self.cache_path - with open(cache_path, 'wb') as f: - pickle.dump(self, f) - return DatasetCache(self.name, cache_path, self.id) - - @property - def id(self): - return self._id or self.name diff --git a/meta_automl/data_preparation/dataset/__init__.py b/meta_automl/data_preparation/dataset/__init__.py new file mode 100644 index 00000000..62c0a37d --- /dev/null +++ b/meta_automl/data_preparation/dataset/__init__.py @@ -0,0 +1,3 @@ +from .dataset_base import DatasetBase, DatasetData, DatasetIDType +from .custom_dataset import DataNotFoundError, CustomDataset +from .openml_dataset import OpenMLDataset, OpenMLDatasetIDType diff --git a/meta_automl/data_preparation/dataset/custom_dataset.py b/meta_automl/data_preparation/dataset/custom_dataset.py new file mode 100644 index 00000000..505868f6 --- /dev/null +++ b/meta_automl/data_preparation/dataset/custom_dataset.py @@ -0,0 +1,30 @@ +from __future__ import annotations + +import pickle +from pathlib import Path +from typing import Optional + +from meta_automl.data_preparation.dataset import DatasetBase +from meta_automl.data_preparation.dataset.dataset_base import DatasetData + + + +class DataNotFoundError(FileNotFoundError): + pass + + +class CustomDataset(DatasetBase): + + def get_data(self, cache_path: Optional[Path] = None) -> DatasetData: + cache_path = cache_path or self.cache_path + if not cache_path.exists(): + raise DataNotFoundError(f'Dataset {self} is missing by the path "{cache_path}".') + with open(cache_path, 'rb') as f: + dataset_data = pickle.load(f) + return dataset_data + + def dump_data(self, dataset_data: DatasetData, cache_path: Optional[Path] = None) -> CustomDataset: + cache_path = cache_path or self.cache_path + with open(cache_path, 'wb') as f: + pickle.dump(dataset_data, f) + return self diff --git a/meta_automl/data_preparation/dataset/dataset_base.py b/meta_automl/data_preparation/dataset/dataset_base.py new file mode 100644 index 00000000..fd84dee5 --- /dev/null +++ b/meta_automl/data_preparation/dataset/dataset_base.py @@ -0,0 +1,40 @@ +from __future__ import annotations + +from abc import abstractmethod, ABC +from dataclasses import dataclass +from pathlib import Path +from typing import Union, Optional, List, Any + +import numpy as np +import pandas as pd +import scipy as sp + +from meta_automl.data_preparation.file_system import CacheOperator, get_dataset_cache_path + +DatasetIDType = Any + + +@dataclass +class DatasetData: + x: Union[np.ndarray, pd.DataFrame, sp.sparse.csr_matrix] + y: Optional[Union[np.ndarray, pd.DataFrame]] = None + categorical_indicator: Optional[List[bool]] = None + attribute_names: Optional[List[str]] = None + + +class DatasetBase(ABC, CacheOperator): + + def __init__(self, id_: DatasetIDType, name: Optional[str] = None): + self.id_ = id_ + self.name = name + + def __repr__(self): + return f'{self.__class__.__name__}(id_={self.id_}, name={self.name})' + + @abstractmethod + def get_data(self) -> DatasetData: + raise NotImplementedError() + + @property + def cache_path(self) -> Path: + return get_dataset_cache_path(self) diff --git a/meta_automl/data_preparation/dataset/openml_dataset.py b/meta_automl/data_preparation/dataset/openml_dataset.py new file mode 100644 index 00000000..08fc5c1d --- /dev/null +++ b/meta_automl/data_preparation/dataset/openml_dataset.py @@ -0,0 +1,39 @@ +from __future__ import annotations + +from typing import Union + +import openml + +from meta_automl.data_preparation.dataset import DatasetBase +from meta_automl.data_preparation.dataset.dataset_base import DatasetData +from meta_automl.data_preparation.file_system import update_openml_cache_dir + +OpenMLDatasetIDType = int + +update_openml_cache_dir() + + +class OpenMLDataset(DatasetBase): + + def __init__(self, id_: OpenMLDatasetIDType): + if isinstance(id_, str): + raise ValueError('Creating OpenMLDataset by dataset name is ambiguous. Please, use dataset id.' + f'Otherwise, you can perform search by f{self.__class__.__name__}.from_search().') + self._openml_dataset = openml.datasets.get_dataset(id_, download_data=False, download_qualities=False, + error_if_multiple=True) + id_ = self._openml_dataset.id + name = self._openml_dataset.name + super().__init__(id_, name) + + @classmethod + def from_search(cls, id_: Union[OpenMLDatasetIDType, str], **get_dataset_kwargs) -> OpenMLDataset: + openml_dataset = openml.datasets.get_dataset(id_, download_data=False, download_qualities=False, + **get_dataset_kwargs) + return cls(openml_dataset.id) + + def get_data(self, dataset_format: str = 'dataframe') -> DatasetData: + X, y, categorical_indicator, attribute_names = self._openml_dataset.get_data( + target=self._openml_dataset.default_target_attribute, + dataset_format=dataset_format + ) + return DatasetData(X, y, categorical_indicator, attribute_names) diff --git a/meta_automl/data_preparation/datasets_loaders/__init__.py b/meta_automl/data_preparation/datasets_loaders/__init__.py index 3908c8e0..4b91c8aa 100644 --- a/meta_automl/data_preparation/datasets_loaders/__init__.py +++ b/meta_automl/data_preparation/datasets_loaders/__init__.py @@ -1,2 +1,2 @@ from .datasets_loader import DatasetsLoader -from .openml_datasets_loader import OpenMLDatasetsLoader, OpenMLDatasetID +from .openml_datasets_loader import OpenMLDatasetsLoader diff --git a/meta_automl/data_preparation/datasets_loaders/datasets_loader.py b/meta_automl/data_preparation/datasets_loaders/datasets_loader.py index 8faba6d0..ab6ffa6c 100644 --- a/meta_automl/data_preparation/datasets_loaders/datasets_loader.py +++ b/meta_automl/data_preparation/datasets_loaders/datasets_loader.py @@ -1,25 +1,17 @@ from __future__ import annotations from abc import abstractmethod -from typing import List, Type +from typing import List -from meta_automl.data_preparation.data_manager import DataManager -from meta_automl.data_preparation.dataset import Dataset, DatasetCache, NoCacheError +from meta_automl.data_preparation.dataset import DatasetBase class DatasetsLoader: - data_manager: Type[DataManager] = DataManager @abstractmethod - def load(self, *args, **kwargs) -> List[DatasetCache]: + def load(self, *args, **kwargs) -> List[DatasetBase]: raise NotImplementedError() @abstractmethod - def load_single(self, *args, **kwargs) -> DatasetCache: + def load_single(self, *args, **kwargs) -> DatasetBase: raise NotImplementedError() - - def cache_to_memory(self, dataset: DatasetCache) -> Dataset: - try: - return dataset.from_cache() - except NoCacheError: - return self.load_single(dataset.id).from_cache() diff --git a/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py b/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py index 7959ca61..11294c45 100644 --- a/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py +++ b/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py @@ -1,57 +1,43 @@ from __future__ import annotations -import shutil -from pathlib import Path -from typing import List, Union +from typing import List, Union, Optional -import openml +from golem.core.log import default_log -from meta_automl.data_preparation.dataset import DatasetCache, Dataset +from meta_automl.data_preparation.dataset import OpenMLDataset, OpenMLDatasetIDType from meta_automl.data_preparation.datasets_loaders import DatasetsLoader -OpenMLDatasetID = Union[str, int] - - -def _clear_openml_cache(): - cache_dir = openml.config.get_cache_directory() - cache_dir = Path(cache_dir) - shutil.rmtree(cache_dir) - class OpenMLDatasetsLoader(DatasetsLoader): + def __init__(self, allow_names: bool = False): + self.dataset_ids = [] + self._allow_names = allow_names - def __init__(self): - self.dataset_sources = [] - - def load(self, dataset_sources: List[OpenMLDatasetID]) -> List[DatasetCache]: - self.dataset_sources = dataset_sources + def load(self, dataset_ids: List[Union[OpenMLDatasetIDType, str]], + allow_names: Optional[bool] = None) -> List[OpenMLDataset]: + self.dataset_ids += dataset_ids + allow_names = self._allow_names if allow_names is None else allow_names datasets = [] # TODO: Optimize like this # https://github.com/openml/automlbenchmark/commit/a09dc8aee96178dd14837d9e1cd519d1ec63f804 - for source in self.dataset_sources: - dataset = self.load_single(source) + for dataset_id in self.dataset_ids: + dataset = self.load_single(dataset_id, allow_name=allow_names) datasets.append(dataset) return datasets - def load_single(self, source: OpenMLDatasetID): - try: - return self.get_openml_dataset(source) - finally: - _clear_openml_cache() - - def get_openml_dataset(self, dataset_id: OpenMLDatasetID, force_download: bool = False) -> DatasetCache: - openml_dataset = openml.datasets.get_dataset(dataset_id, download_data=False, download_qualities=False) - name = openml_dataset.name.lower() - dataset_cache_path = self.data_manager.get_dataset_cache_path(name) - if dataset_cache_path.exists() and not force_download: - dataset_cache = DatasetCache(name, dataset_cache_path) + def load_single(self, dataset_id: Union[OpenMLDatasetIDType, str], + allow_name: Optional[bool] = None) -> OpenMLDataset: + allow_name = self._allow_names if allow_name is None else allow_name + + if allow_name: + dataset = OpenMLDataset.from_search(dataset_id) else: - dataset_id = openml_dataset.id - X, y, categorical_indicator, attribute_names = openml_dataset.get_data( - target=openml_dataset.default_target_attribute, - dataset_format='array' - ) - dataset = Dataset(name, X, y, categorical_indicator, attribute_names, _id=dataset_id) - dataset_cache = dataset.dump_to_cache(dataset_cache_path) - return dataset_cache + dataset = OpenMLDataset(dataset_id) + + self.dataset_ids.append(dataset.id_) + return dataset + + @property + def _log(self): + return default_log(self) diff --git a/meta_automl/data_preparation/file_system/__init__.py b/meta_automl/data_preparation/file_system/__init__.py new file mode 100644 index 00000000..a228da6e --- /dev/null +++ b/meta_automl/data_preparation/file_system/__init__.py @@ -0,0 +1,5 @@ +from meta_automl.data_preparation.file_system.file_system import PathType, get_project_root, get_data_dir +from meta_automl.data_preparation.file_system.cache import (CacheOperator, get_dataset_cache_path, + get_dataset_cache_path_by_id, get_meta_features_cache_path, + get_local_meta_features, update_local_meta_features, + get_openml_cache_dir, update_openml_cache_dir) diff --git a/meta_automl/data_preparation/file_system/cache.py b/meta_automl/data_preparation/file_system/cache.py new file mode 100644 index 00000000..99daf965 --- /dev/null +++ b/meta_automl/data_preparation/file_system/cache.py @@ -0,0 +1,95 @@ +from __future__ import annotations + +import pickle +from pathlib import Path + +from typing import Type, Any, Dict, TYPE_CHECKING + +import openml + +from meta_automl.data_preparation.file_system.cache_properties import CacheProperties, CacheType +from meta_automl.data_preparation.file_system.file_system import get_data_dir, ensure_dir_exists + +if TYPE_CHECKING: + from meta_automl.data_preparation.dataset import DatasetBase + from meta_automl.data_preparation.meta_features_extractors import MetaFeaturesExtractor + + +class CacheOperator: + pass + + +def get_openml_cache_dir() -> Path: + return get_data_dir().joinpath('openml_cache') + + +def get_full_openml_cache_dir() -> Path: + return get_data_dir().joinpath('openml_cache/org/openml/www') + + +def update_openml_cache_dir(): + openml_cache_path = str(get_openml_cache_dir()) + openml.config.set_cache_directory(openml_cache_path) + + +def _get_cache_path(object_class: Type[CacheOperator], object_id: str, _create_parent_dir: bool = True) -> Path: + cache_properties = get_cache_properties(object_class.__name__) + directory = cache_properties.dir_ + path = cache_properties.template.format(id_=object_id) + path = directory.joinpath(path) + if _create_parent_dir: + ensure_dir_exists(directory) + return path + + +def get_dataset_cache_path(dataset: DatasetBase) -> Path: + class_ = dataset.__class__ + id_ = dataset.id_ + return _get_cache_path(class_, str(id_)) + + +def get_dataset_cache_path_by_id(class_: Type[DatasetBase], id_: Any) -> Path: + return _get_cache_path(class_, str(id_)) + + +def get_meta_features_cache_path(extractor_class: Type[MetaFeaturesExtractor], dataset_id: Any) -> Path: + return _get_cache_path(extractor_class, str(dataset_id)) + + +def get_local_meta_features(extractor_class: Type[MetaFeaturesExtractor], dataset_id: Any) -> Dict[str, Any]: + meta_features_file = get_meta_features_cache_path(extractor_class, dataset_id) + if not meta_features_file.exists(): + return {} + with open(meta_features_file, 'rb') as f: + meta_features = pickle.load(f) + return meta_features + + +def update_local_meta_features(extractor_class: Type[MetaFeaturesExtractor], + dataset_id: Any, meta_features: Dict[str, Any]): + meta_features_file = get_meta_features_cache_path(extractor_class, dataset_id) + meta_features_old = get_local_meta_features(extractor_class, dataset_id) + with open(meta_features_file, 'wb') as f: + meta_features_old.update(meta_features) + pickle.dump(meta_features_old, f) + + +def get_cache_properties(class_name: str) -> CacheProperties: + cache_properties_by_class_name = { + 'OpenMLDataset': CacheProperties( + type_=CacheType.directory, + dir_=get_full_openml_cache_dir().joinpath('datasets'), + template='{id_}'), + 'CustomDataset': CacheProperties( + type_=CacheType.file, + dir_=get_data_dir().joinpath('datasets/custom_dataset'), + template='{id_}.pkl'), + 'PymfeExtractor': CacheProperties( + type_=CacheType.file, + dir_=get_data_dir().joinpath('metafeatures/pymfe'), + template='{id_}.pkl'), + } + try: + return cache_properties_by_class_name[class_name] + except KeyError as e: + raise KeyError(f'Cache properties for the class {class_name} are not defined.').with_traceback(e.__traceback__) diff --git a/meta_automl/data_preparation/file_system/cache_properties.py b/meta_automl/data_preparation/file_system/cache_properties.py new file mode 100644 index 00000000..7374df08 --- /dev/null +++ b/meta_automl/data_preparation/file_system/cache_properties.py @@ -0,0 +1,21 @@ +from __future__ import annotations + +from dataclasses import dataclass +from enum import Enum +from pathlib import Path +from typing import Optional, TYPE_CHECKING + +if TYPE_CHECKING: + from meta_automl.data_preparation.file_system import PathType + + +class CacheType(Enum): + file = 'file' + directory = 'directory' + + +@dataclass +class CacheProperties: + type_: Optional[CacheType] = None + dir_: Optional[Path] = None + template: Optional[PathType] = None diff --git a/meta_automl/data_preparation/file_system/file_system.py b/meta_automl/data_preparation/file_system/file_system.py new file mode 100644 index 00000000..ff2c3743 --- /dev/null +++ b/meta_automl/data_preparation/file_system/file_system.py @@ -0,0 +1,27 @@ +from __future__ import annotations + +from os import PathLike +from pathlib import Path +from typing import Union + +PathType = Union[PathLike, str] + +DATA_SUBDIR = 'data' + + +def ensure_dir_exists(dir_: Path) -> Path: + if dir_.is_file(): + dir_ = dir_.parent + if not dir_.exists(): + dir_.mkdir(parents=True) + return dir_ + + +def get_project_root() -> Path: + """Returns project root folder.""" + return Path(__file__).parents[3] + + +def get_data_dir() -> Path: + data_dir = get_project_root().joinpath(DATA_SUBDIR) + return data_dir diff --git a/meta_automl/data_preparation/meta_features_extractors/meta_features_extractor.py b/meta_automl/data_preparation/meta_features_extractors/meta_features_extractor.py index dc7ccf5a..d81e8cbd 100644 --- a/meta_automl/data_preparation/meta_features_extractors/meta_features_extractor.py +++ b/meta_automl/data_preparation/meta_features_extractors/meta_features_extractor.py @@ -1,28 +1,28 @@ from __future__ import annotations -from abc import abstractmethod -from typing import Optional, Iterable, Dict, Any, Type +from abc import abstractmethod, ABC +from typing import Optional, Iterable, Dict, Any import pandas as pd -from meta_automl.data_preparation.data_manager import DataManager +from meta_automl.data_preparation.dataset import DatasetIDType +from meta_automl.data_preparation.file_system import (CacheOperator, get_local_meta_features, + update_local_meta_features) -class MetaFeaturesExtractor: - DEFAULT_PARAMS: Optional[Dict[str, Any]] = None - SOURCE: Optional[str] = None - data_manager: Type[DataManager] = DataManager +class MetaFeaturesExtractor(ABC, CacheOperator): + default_params: Optional[Dict[str, Any]] = None @abstractmethod def extract(self, datasets) -> pd.DataFrame: raise NotImplementedError() - def _get_meta_features_cache(self, dataset_name: str, meta_feature_names: Iterable[str]): - cache = self.data_manager.get_meta_features_dict(dataset_name, self.SOURCE) + def _get_meta_features_cache(self, dataset_id: DatasetIDType, meta_feature_names: Iterable[str]): + cache = get_local_meta_features(self.__class__, str(dataset_id)) if set(meta_feature_names) ^ cache.keys(): return None else: return {mf_name: cache[mf_name] for mf_name in meta_feature_names} - def _update_meta_features_cache(self, dataset_name: str, meta_features_dict: Dict[str, Any]): - self.data_manager.update_meta_features_dict(dataset_name, self.SOURCE, meta_features_dict) + def _update_meta_features_cache(self, dataset_id: DatasetIDType, meta_features_dict: Dict[str, Any]): + update_local_meta_features(self.__class__, dataset_id, meta_features_dict) diff --git a/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py b/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py index 8dbc728f..edfa6925 100644 --- a/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py +++ b/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py @@ -6,17 +6,16 @@ from golem.core.log import default_log from pymfe.mfe import MFE -from meta_automl.data_preparation.dataset import DatasetCache +from meta_automl.data_preparation.dataset import DatasetBase, DatasetIDType from meta_automl.data_preparation.datasets_loaders import DatasetsLoader, OpenMLDatasetsLoader from meta_automl.data_preparation.meta_features_extractors import MetaFeaturesExtractor class PymfeExtractor(MetaFeaturesExtractor): - DEFAULT_PARAMS = {'groups': 'default'} - SOURCE = 'pymfe' + default_params = {'groups': 'default'} def __init__(self, extractor_params: Dict[str, Any] = None, datasets_loader: DatasetsLoader = None): - self.extractor_params = extractor_params if extractor_params is not None else self.DEFAULT_PARAMS + self.extractor_params = extractor_params if extractor_params is not None else self.default_params self._datasets_loader = datasets_loader or OpenMLDatasetsLoader() self._extractor = MFE(**self.extractor_params) self._logger = default_log(self) @@ -27,21 +26,21 @@ def datasets_loader(self) -> DatasetsLoader: raise ValueError("Datasets loader not provided!") return self._datasets_loader - def extract(self, datasets: List[Union[DatasetCache, str]], fill_input_nans: bool = False, - use_cached: bool = True, update_cached: bool = True) -> pd.DataFrame: + def extract(self, datasets_or_ids: List[Union[DatasetBase, DatasetIDType]], + fill_input_nans: bool = False, use_cached: bool = True, update_cached: bool = True) -> pd.DataFrame: meta_features = {} meta_feature_names = self._extractor.extract_metafeature_names() - load_dataset = self.datasets_loader.cache_to_memory - for dataset in datasets: - if isinstance(dataset, str): - dataset = DatasetCache(dataset) - self._logger.info(f'Extracting meta features of the dataset {dataset.name}...') + for dataset in datasets_or_ids: + if not isinstance(dataset, DatasetBase): + dataset = self._datasets_loader.load_single(dataset) + + self._logger.info(f'Extracting meta features of the dataset {dataset}...') if (use_cached and - (mfs := self._get_meta_features_cache(dataset.name, meta_feature_names))): - meta_features[dataset.name] = mfs + (mfs := self._get_meta_features_cache(dataset.id_, meta_feature_names))): + meta_features[dataset.id_] = mfs else: - loaded_dataset = load_dataset(dataset) + loaded_dataset = dataset.get_data(dataset_format='array') cat_cols = [i for i, val in enumerate(loaded_dataset.categorical_indicator) if val] x = loaded_dataset.x y = loaded_dataset.y @@ -51,8 +50,8 @@ def extract(self, datasets: List[Union[DatasetCache, str]], fill_input_nans: boo feature_names, dataset_features = mfe.extract(out_type=tuple) mfs = dict(zip(feature_names, dataset_features)) if update_cached: - self._update_meta_features_cache(dataset.name, mfs) - meta_features[dataset.name] = mfs + self._update_meta_features_cache(dataset.id_, mfs) + meta_features[dataset.id_] = mfs meta_features = pd.DataFrame.from_dict(meta_features, orient='index') return meta_features diff --git a/meta_automl/data_preparation/model.py b/meta_automl/data_preparation/model.py index 25de781c..d437ea24 100644 --- a/meta_automl/data_preparation/model.py +++ b/meta_automl/data_preparation/model.py @@ -3,13 +3,16 @@ from golem.core.optimisers.fitness import Fitness -from meta_automl.data_preparation.dataset import DatasetCache +from meta_automl.data_preparation.dataset import DatasetBase + + +PredictorType = Any @dataclass class Model: - predictor: Any + predictor: PredictorType fitness: Fitness fitness_metric_name: str - dataset_cache: DatasetCache + dataset: DatasetBase metadata: Dict[str, Any] = field(default_factory=dict) diff --git a/meta_automl/data_preparation/models_loaders/fedot_pipelines_loader.py b/meta_automl/data_preparation/models_loaders/fedot_pipelines_loader.py index ae7f0b38..599056fa 100644 --- a/meta_automl/data_preparation/models_loaders/fedot_pipelines_loader.py +++ b/meta_automl/data_preparation/models_loaders/fedot_pipelines_loader.py @@ -14,8 +14,8 @@ from golem.core.log import default_log from tqdm import tqdm -from meta_automl.data_preparation.data_manager import PathType -from meta_automl.data_preparation.dataset import DatasetCache +from meta_automl.data_preparation.file_system import PathType +from meta_automl.data_preparation.dataset import DatasetBase from meta_automl.data_preparation.datasets_loaders import DatasetsLoader, OpenMLDatasetsLoader from meta_automl.data_preparation.model import Model from meta_automl.data_preparation.models_loaders import ModelsLoader @@ -29,10 +29,9 @@ def evaluate_classification_fedot_pipeline(pipeline, input_data): return fitness -def get_n_best_fedot_performers(dataset_cache: DatasetCache, pipelines: List[Pipeline], datasets_loader: DatasetsLoader, - n_best: int = 1) -> List[Model]: - loaded_dataset = datasets_loader.cache_to_memory(dataset_cache) - X, y_test = loaded_dataset.x, loaded_dataset.y +def get_n_best_fedot_performers(dataset: DatasetBase, pipelines: List[Pipeline], n_best: int = 1) -> List[Model]: + data = dataset.get_data() + X, y_test = data.x, data.y input_data = InputData(idx=np.arange(0, len(X)), features=X, target=y_test, data_type=DataTypesEnum.table, task=Task(TaskTypesEnum.classification)) fitnesses = [] @@ -41,14 +40,14 @@ def get_n_best_fedot_performers(dataset_cache: DatasetCache, pipelines: List[Pip for pipeline in tqdm(pipelines, desc='Evaluating pipelines'): fitness = evaluate_classification_fedot_pipeline(pipeline, input_data) fitnesses.append(fitness) - models.append(Model(pipeline, fitness, metric_name, dataset_cache)) + models.append(Model(pipeline, fitness, metric_name, dataset)) best_models = [models.pop(np.argmax(fitnesses)) for _ in range(min(n_best, len(pipelines)))] return best_models class FEDOTPipelinesLoader(ModelsLoader): - def __init__(self, datasets_to_load: Union[List[Union[DatasetCache, str]], Literal['auto']] = 'auto', + def __init__(self, datasets_to_load: Union[List[Union[DatasetBase, str]], Literal['auto']] = 'auto', candidate_pipelines: Optional[List[List[Pipeline]]] = None, candidate_pipeline_paths: Optional[List[List[PathType]]] = None, launch_dir: Optional[PathType] = None, @@ -56,12 +55,12 @@ def __init__(self, datasets_to_load: Union[List[Union[DatasetCache, str]], Liter self.log = default_log(self) - self.datasets_loader = datasets_loader or OpenMLDatasetsLoader() + self.datasets_loader = datasets_loader or OpenMLDatasetsLoader(allow_names=True) self.launch_dir: Path = Path(launch_dir) if isinstance(launch_dir, str) else launch_dir - self._datasets: List[DatasetCache] = (self._define_datasets() if datasets_to_load == 'auto' - else self._dataset_names_to_cache(datasets_to_load)) + self._datasets: List[DatasetBase] = (self._define_datasets() if datasets_to_load == 'auto' + else self._get_datasets_from_names(datasets_to_load)) self.candidate_pipelines = candidate_pipelines @@ -71,8 +70,8 @@ def __init__(self, datasets_to_load: Union[List[Union[DatasetCache, str]], Liter def load(self, datasets: Union[List[str], Literal['auto']] = 'auto', n_best: int = 1) -> List[List[Model]]: if datasets != 'auto': - datasets = self._dataset_names_to_cache(datasets) - difference = set(d.name for d in datasets) - set(self.dataset_names) + datasets = self._get_datasets_from_names(datasets) + difference = set(d.name for d in datasets) - set(self.dataset_ids) if difference: raise ValueError(f'Results for these datasets are not available: {difference}.') else: @@ -89,10 +88,10 @@ def _define_pipeline_paths(self) -> List[List[Path]]: if not self.launch_dir: raise ValueError('Launch dir or model paths must be provided!') - dataset_names = self.dataset_names - datasets_models_paths = dict(zip(dataset_names, [[]] * len(dataset_names))) + dataset_ids = self.dataset_ids + datasets_models_paths = dict(zip(dataset_ids, [[]] * len(dataset_ids))) - for dataset_name in tqdm(dataset_names, desc='Defining model paths', unit='dataset'): + for dataset_name in tqdm(dataset_ids, desc='Defining model paths', unit='dataset'): for model_path in self.launch_dir.joinpath(dataset_name).glob(r'FEDOT*\*\*\launch_*.json'): datasets_models_paths[dataset_name].append(model_path) @@ -104,28 +103,27 @@ def _import_pipelines(self, candidate_pipeline_paths: List[List[PathType]]): desc='Importing pipelines', unit='dataset'): candidates_for_dataset = [Pipeline.from_serialized(str(p)) for p in paths] if not candidates_for_dataset: - self.log.warning(f'No pipelines found for the dataset "{dataset.name}".') + self.log.warning(f'No pipelines found for the dataset "{dataset}".') candidate_pipelines.append(candidates_for_dataset) self.candidate_pipelines = candidate_pipelines - def _define_datasets(self) -> List[DatasetCache]: + def _define_datasets(self) -> List[DatasetBase]: if not self.launch_dir: raise ValueError('Launch dir or datasets must be provided!') datasets = list({p.parents[2].name for p in self.launch_dir.glob(r'*\FEDOT*\*\launch_0')}) datasets.sort() - datasets = self._dataset_names_to_cache(datasets) + datasets = self._get_datasets_from_names(datasets) return datasets @property - def dataset_names(self): - return [d.name if isinstance(d, DatasetCache) else d for d in self._datasets] + def dataset_ids(self): + return [d.name if isinstance(d, DatasetBase) else d for d in self._datasets] - @staticmethod - def _dataset_names_to_cache(datasets: List[Union[str, DatasetCache]]) -> List[DatasetCache]: + def _get_datasets_from_names(self, datasets: List[Union[str, DatasetBase]]) -> List[DatasetBase]: new_list = [] for dataset in datasets: - if isinstance(dataset, str): - dataset = DatasetCache(dataset) + if not isinstance(dataset, DatasetBase): + dataset = self.datasets_loader.load_single(dataset) new_list.append(dataset) return new_list diff --git a/meta_automl/data_preparation/models_loaders/knowledge_base_models_loader.py b/meta_automl/data_preparation/models_loaders/knowledge_base_models_loader.py index e26b896e..7c38b9d8 100644 --- a/meta_automl/data_preparation/models_loaders/knowledge_base_models_loader.py +++ b/meta_automl/data_preparation/models_loaders/knowledge_base_models_loader.py @@ -7,12 +7,13 @@ from fedot.core.pipelines.pipeline import Pipeline from golem.core.optimisers.fitness import SingleObjFitness -from meta_automl.data_preparation.data_manager import DataManager -from meta_automl.data_preparation.dataset import DatasetCache + +from meta_automl.data_preparation.dataset import OpenMLDataset +from meta_automl.data_preparation.file_system import get_data_dir from meta_automl.data_preparation.model import Model from meta_automl.data_preparation.models_loaders import ModelsLoader -DEFAULT_KNOWLEDGE_BASE_PATH = DataManager.get_data_dir().joinpath('knowledge_base_0') +DEFAULT_KNOWLEDGE_BASE_PATH = get_data_dir().joinpath('knowledge_base_0') class KnowledgeBaseModelsLoader(ModelsLoader): @@ -21,21 +22,21 @@ def __init__(self, knowledge_base_path: Union[str, PathLike] = DEFAULT_KNOWLEDGE self.df_knowledge_base: Optional[pd.DataFrame] = None self.df_datasets: Optional[pd.DataFrame] = None - def load(self, dataset_names: Optional[Sequence[str]] = None, + def load(self, dataset_ids: Optional[Sequence[str]] = None, fitness_metric: str = 'f1') -> List[Model]: if self.df_knowledge_base is None: knowledge_base_split_file = self.knowledge_base_path.joinpath('knowledge_base.csv') self.df_knowledge_base = pd.read_csv(knowledge_base_split_file) - if dataset_names is None: - dataset_names = self.parse_datasets()['dataset_name'] + if dataset_ids is None: + dataset_ids = self.parse_datasets()['dataset_id'] df_knowledge_base = self.df_knowledge_base - df_knowledge_base = df_knowledge_base[df_knowledge_base['dataset_name'].isin(dataset_names)] + df_knowledge_base = df_knowledge_base[df_knowledge_base['dataset_id'].isin(dataset_ids)] cached_datasets = {} - for name in dataset_names: - cached_datasets[name] = DatasetCache(name) + for id_ in dataset_ids: + cached_datasets[id_] = OpenMLDataset(id_) models = [] for _, row in df_knowledge_base.iterrows(): @@ -45,7 +46,7 @@ def load(self, dataset_names: Optional[Sequence[str]] = None, metric_value = row[fitness_metric] fitness = SingleObjFitness(metric_value) metadata = dict(row) - dataset_cache = cached_datasets[row['dataset_name']] + dataset_cache = cached_datasets[row['dataset_id']] model = Model(predictor, fitness, fitness_metric, dataset_cache, metadata) models.append(model) return models diff --git a/meta_automl/meta_algorithm/datasets_similarity_assessors/model_based_similarity_assessors.py b/meta_automl/meta_algorithm/datasets_similarity_assessors/model_based_similarity_assessors.py index 09720a1e..40008d00 100644 --- a/meta_automl/meta_algorithm/datasets_similarity_assessors/model_based_similarity_assessors.py +++ b/meta_automl/meta_algorithm/datasets_similarity_assessors/model_based_similarity_assessors.py @@ -1,10 +1,11 @@ from abc import ABC -from typing import Optional, Dict, Any, List, Iterable +from typing import Optional, List, Iterable import numpy as np import pandas as pd from sklearn.neighbors import NearestNeighbors +from meta_automl.data_preparation.dataset import DatasetIDType from meta_automl.meta_algorithm.datasets_similarity_assessors.datasets_similarity_assessor import \ DatasetsSimilarityAssessor @@ -13,7 +14,7 @@ class ModelBasedSimilarityAssessor(ABC, DatasetsSimilarityAssessor): def __init__(self, model, n_best: int = 1): self._inner_model = model self.n_best = n_best - self._datasets: Optional[Iterable[str]] = None + self._datasets: Optional[Iterable[DatasetIDType]] = None class KNeighborsBasedSimilarityAssessor(ModelBasedSimilarityAssessor): @@ -21,7 +22,7 @@ def __init__(self, n_neighbors: int = 1, **model_params): model = NearestNeighbors(n_neighbors=n_neighbors, **model_params) super().__init__(model, n_neighbors) - def fit(self, meta_features: pd.DataFrame, datasets: Iterable[str]): + def fit(self, meta_features: pd.DataFrame, datasets: Iterable[DatasetIDType]): meta_features = self.preprocess_meta_features(meta_features) self._datasets = np.array(datasets) self._inner_model.fit(meta_features) @@ -30,7 +31,7 @@ def fit(self, meta_features: pd.DataFrame, datasets: Iterable[str]): def preprocess_meta_features(meta_features: pd.DataFrame) -> pd.DataFrame: return meta_features.dropna(axis=1, how='any') - def predict(self, meta_features: pd.DataFrame, return_distance: bool = False) -> Iterable[Iterable[str]]: + def predict(self, meta_features: pd.DataFrame, return_distance: bool = False) -> Iterable[Iterable[DatasetIDType]]: dataset_indexes = self._inner_model.kneighbors(meta_features, return_distance=return_distance) if return_distance: distances, dataset_indexes = dataset_indexes diff --git a/meta_automl/meta_algorithm/model_advisors/model_advisor.py b/meta_automl/meta_algorithm/model_advisors/model_advisor.py index a9ca0d97..c653a173 100644 --- a/meta_automl/meta_algorithm/model_advisors/model_advisor.py +++ b/meta_automl/meta_algorithm/model_advisors/model_advisor.py @@ -1,8 +1,9 @@ from abc import abstractmethod -from typing import List, Dict, Iterable, Optional +from typing import List, Dict, Iterable import pandas as pd +from meta_automl.data_preparation.dataset import DatasetIDType from meta_automl.data_preparation.model import Model from meta_automl.meta_algorithm.datasets_similarity_assessors import DatasetsSimilarityAssessor @@ -17,13 +18,13 @@ def predict(self, *args, **kwargs) -> List[List[Model]]: class SimpleSimilarityModelAdvisor(ModelAdvisor): def __init__(self, fitted_similarity_assessor: DatasetsSimilarityAssessor): self.similarity_assessor = fitted_similarity_assessor - self.best_models: Dict[str, List[Model]] = {} + self.best_models: Dict[DatasetIDType, List[Model]] = {} @property def datasets(self): return self.similarity_assessor.datasets - def fit(self, dataset_names_to_best_pipelines: Dict[str, List[Model]]): + def fit(self, dataset_names_to_best_pipelines: Dict[DatasetIDType, List[Model]]): self.best_models.update(dataset_names_to_best_pipelines) return self diff --git a/requirements.txt b/requirements.txt index eca13d853ca1f8e55c583bd3790a78a679ffee4d..ad0a22332f176f2c866188116575624428ac1536 100644 GIT binary patch delta 38 pcmZ3-e1>_$I!3t?h75*OhIEE}h8!TB%U}zH1`K)(MnG)9006?}>0$1-Lige-*6QG}uhMaV)3MJPf?D2k#8RTheng`#wy z&wHP*{p8_2*EZe1SA6gH_vicjJ%8TkJ?GfD=1eyI#h?ClH(aZi-nM7|uA6qC`wbfT`C#Cg~wV(czwLf0lvbyQ^UAOI7TYb&0 zyWY3|&RzR%-F5rg>Unpst-kj5yKY-~{~K<()!EqT*Hs6yZvi7AG-3y{Vs0umi0B}XZ$fX zHikugHqzGp7wOWs&9`%1q)T0!kKgu*J{(>AmXojhL^BpkPWEbfJ^Gz{9Br*1>$NDK zc-=oeYp(6ojExgrH1S)mYd5k}*J|pxwpTQ)F%!RvW+OF}ocnnvAFK z6+b){OD;SVHnoX`Yi;ast4X|OHMQT`b&oM^9b06#Js0_@57)Y}*jPTgu$tK6#MU$G zKkHtt-PYCl<$Cq=(-Y6HX-0Et7tXG>K^XtB2W7YXZ^{pm-YJD`}?VH&zZu*O zwv+$Ww|0|9us;l}+eB;&_z*Y`|3dg*!hZxV#$JY3;WxoQ3LeD%2G~aL2iE<`pTM60 zpCi5x{&!#%JcR#Y@GSUm_($NYz^6dR9KVm)gW%`D^~4Ua*9H0{#!iz=hy^umXMqJW1{cVb}1p@MG{*@N@LL9ex=9_uymj zhu~iU2eH2mT;r?24lqN1BeC7sKLu9s*^ll$*f)cdrN*_mC;IoYUmqdoTHnoj{tR}F z*T9#9Uj|p_07GmoyLhxcRoif;1X>Epeda&F!R=e~UqyCbIdFR?GA?kMa0UF^feyvNRYEA^byc^ z?w7zj!NvHW06&7g1-6&Zj_2_L@OrS7J-HWNBj-Ky-rD;=vqtyY=ZiH>9>uKNBoILy6@P~VW?VkJ^yhN>g?3!ob*NM4D z2eCcNYV^IA9|he1?lSlgVC};h+f3|@Xq=xl$MHV^d)6-n^qqVh{#U^B_*=j;y#;)R zKKFpPVE;$pzU&0=1wJ!-VE5106rZ^X_vu#bONi|R&fR-|J!5$O-Nz+xH+T;0BDMy4 z>fDbV-~d|h-7)(8Cj9#V{lxKI_g}KE|BAhg?V7!x6Yw$YYvFyM!#)AtfPE3{YnHDE z&gWt953oHiuJy;kW5iwnuFq$HIduOR)}Jvqd#T~sI@Q}XUcy|QKibLt@b$oF{~B`6 z>o0)w;!Jdpfb)Spe-b#a_rRX-dtvXB&&{jBSIN0Ij(Hxv{|ELe_TK{U^^?Fp#_k91 zgJ=Gy!8fosgKMze2hVd4ei8g9@OfhH%k}IjXSur!^u&G`_Vwl=U~kXB$FYA4{z334 z_6^{5*lVz7)${S|@SlO70sjcRNbIlRIs0b6e~SHDa4+`1hP|g>!u}!nGI$Bx&l->5 z`}*jacP&1-;41_1yYwxE^1Vz6=hd-v<8#Jsu|hG2nT=2>-A?xn*iUjO{+T zUhk1{pUrDv&y#EZ9QIydKlW*OAAAP;ugQIse%{a9z_*CGCeNfbKLGn2AIE+b?Af)) zAsBnIO3b{WH0u3 z@%sHr?!$fnytlx4xvoFNe;4pM`~&zNuoL?-;Jx1m&L>8X;r-Ba<=(%7n6G!=1G|6j z&oSV0>G|6PT$`_zEAXd*=lxmcV&1*D3%&{*!9GFmA7WpM?OZOv_I!N>TkY4V^O=4< z_&h%QH_ZJ3>e##C-mb#86MqWc0@N~I_gUbxpw|2DS@7)eoDJvL@jQnQ0M;}-OE+M@ z9W84c=6e9!e%pz6;056PJzL+x-T_>@`{Did^~-TLfo}uzPlBbwAA+9)K2yGT8fT3| zt@X~|nja!}xp7`&)?i;_4#LOLEMa?}9M}45;H`l3H|)P_@;S8LGw5@;6I?>x-W@%C z<~+ZTz}~|tF>?s*K9eF3}?oon#9w9cAC*smo%qi(qO#CqUssdIB)o)bTN9)o-I z);X`Ou(tQldvEVez?|<*zRr9Z-~G50_I_Pabn2boRiI;RKVNy)K7qd(=GhtQ?B&^V zzV7oja1h&P)3vETNq!0R^!ah(&f!wnGw7V`>sZ&~JJ+Z19e)RGz5DFGyBEG*4EGlM zGsK;{b2F#TI`_n9$Y)sF{r4F>jBP)ko!mFi)=p}jtFak#bUr=)r-5tP4xGzT*fC~k zjQjj_#Cgvi=I?vWar`&K?!|*>yeI0d{UUHaYFy6=?4O10<-E3mqvURY9p|m&T!-^= zoxb+IgujH{1M7x;f-eBOfpzMS!>>f|c<#*v$Ucjn@q6LNz&>DXncNBBKDZ}7%a6nR z!NtJ7KTMtX?C#=U4x9gNVl&q0{G8LXz%@FKdvA=-9>X}EIp?9?v*x*R-R_s~De9fG z>vA710>^;;9)?$e&*jen*RU6OW|r~o<5)+)Q(zOAGLE>EynEuld4JXY1~L0x0-Vzh z*m16b)w_?5|9RlPKLFGpM&o{n%fYPF9Ke1O_#Si*JO{qEKLZ~{nE{2+lR63H6`yD?uGa5ZRG6bp1ZC` z!Exg5ou4JVS89y8r}njW0y=Eh=I6*8VEehgjxpQ=_hK`?&!fGEbz*xath0~L!WL}z z;{?9X(q{Z$D;me|u`ht#Pj$Xlc{W{-xt==bBfE!J0r%tQ;fsLd`r7*#G1qw#*vIG8 zIqbx@x9r?ptLu;*SKIaY8UF?NAaES_$n)dp2YVTR1X$<$sQc=3<5o&pOYsYpz?t!r>_F-@k+qGQ?F9Ut|dlk%y`&#u`Y@dZIvE5tua5#tDi@Vjq-gn2a z&NVr{p9^}}wM@X}!10&KJC18OQtYdVe-q#RbRTEH_cPb~BJ7;4cTSGu^X(jb9Ww5E zJVR^Pj&}n8Lj2zZu7792A^2Ki_rlKU>);jm*<`Goz6rL!?=LUFJIJ|D#8)DKZ?db139Mso+swm%b)|+`>gy7*h0>@=hZRXTi@e4&W$>eD}LB`?C5Gg zcCD-JI^V|iy=qPDn#VMWh1Iv$q8@d8k=^!Pq&u5<-7ES!R@=$Tx$0Q#)P$o?zUEp} z$73fi=Sr^S!Y0gJkz7)c3v4L@tSM9?X!`ex>jS{ zs~t1Cd-5s9!YUMY@gRXItCqT7PT|HC}bD_G?>B{BT`& zE_PjC+h^0G)$4n0RNLkk`E5Ks>$tX;xo~tXuPeS&+t#&y+vo4&e>c5T6VCYh^;O4X z%U(4v%B?G3e(&;LE_Zc)VF-L^^4|WKd-jWdUX9~or^YH=VG5rF4~sU zOHQtiCB7(EYu7uBuQ7dU{i0Yj8ME%2STq~;7}KV%&ee9CPp;*Rn|SIHU*xOzYITX# zb=rNu{^uZh0;a|9VE=3o%?>{YgE?_+xd(n`o)+7@wn*KOBi<96H%K3Q%*c7E*H>%4 z2gaf`?l`j8T#tH&QM8O;0jan$tK`m(ubU2jLs zoKUN;#;fm7@5`w?xhe5k$ysBr2iEna*7*sw=Aw~1FfBZTt;u@K2OXF8&=buP(4PqA zK7}_E^D^!gp7rateh+eG9P5NV?WL_&#I|k*)EMi)j-sJ!+QI{9BQl|YK#GO!!(GduERIxHRSs;KHL>pYmM5}3`}fbQt}iGL23 z-kdOQ&5YO-bl9?*4yZA&ogUgfm;&>|+PifdI#s7;4(zGUtCvA?#_eNV?Vv|uTzdwF z3)Y?jwO-nDP}jybuU-uEr^cQ=erJ8bGHefh^BuP98FbjaF)z7jrv3U}ysVCN?a$Dt zSG!~_Y+k!BzOh*qH}7T4T+WpK)L5>5z1JM}l7+q1+230IK?C>1XGJ4-g1GloSTiMN zjXC`uHaCOk1?G)+aO%u?4NSc2PunwWVa!@SFiU`f}`9lkd2UdCAeui=CM90O=`RV(KPEV_rWwtI@SZwoS zwlQ;}126Lpm6$xKY*H`FwfWT5HM3H0ZS3g6UD4#;CO#uRDY?Yt#AYQoFC4#bYC5hu z7GM1YB-ddZi(j+xuE`M@w6ghHKW0xFP3zky*Q`I?Gq$#LY{ zw6W(+-gt6y?%7OxfwtpP>!n7mFc$8(^v4j5Ij>ryKk2_SR%?=1YkXSt#%IMgu3h`n z##a+eirsM~*79rvO>)*ufPRpeER4%uwXYov$&!!8*rb1XjecV4v}Yq4b7CxKEEv<& zF>NndObW-J6~AU}uf+QP<+YRIxA|5Vzh!$Hn}G>Pp71H#Tpic1W3{cW_VY{_uhDVU zerr$Ad)4t-KZJ(<+M>?4_H;y(noF)+Kl>;7OzRx|{e_Fbgqnl!0q_KP82I}Z{{F>r zuoHA>w5Qf$`+FM3Ciqi$2K>F1&k&zr_wY6_C-#sr#(M_73ivxJ9hw91*T9#7v7Xop z`eA->hyMUPg|+Qx|D7e*6T1Q5-^noN@0QGn`Fk7o@b^aM_^#0!`759!rth4V@nvI| zVXN_XU+g8O#E+wO9sXX&Py^4Z9@hAKC_ORrbGXw6d#_yYp=MsdiX`drpaezQktO9k>_2ht02mCE(ih=futJ#_!+> za1G`zfnAf@wPHKg7qFef9Id(&!1X%c9k4YMa&!Bk@8Cy??If??<2&EG!R8Y4p4iL0 zYq|$41NYxOoKiCf_E-V;lXD*(&zhy;JGT0d!QLnJ`mSjT>pOpIPQuq4r>}jU1)JDs zb7~*P*LIw)#EpFkI0t)8sde7xlYfx7XLe4k1IMtRBj?)eZ=VTq>%NM87~TZWiHW}O zVPY2p`x@_H_jZbH%(cv6?Vgx>4Citw@OgT%M5%Wy&;3e?yGQms0L;06-Xr^3+kwNyf1%`^=MikjGxu%S`OO)}Irs3r@C06i zy>DB<9N&KS?y((D+p&A%(-K?8?y$4R@>9UG&=a4Pv2re__>SwIn*Rj0YxpL-1SZ6s z^91Y!9sbkC8T%nvzX$q`?f$qg?%BM=H{*9;GckJ_x7Vk!t#geTLqGfP+%k^$)Vc{! z>l~NN3u1@iIq)^Yy_tiZ=I9rgn*pEC;T)EHNBve}jyuP;w`{%h)^<$K?>+eHdSYv^ zz3nw4m%bBn?&%!1#}wO`^Xg!0JdegY^26GR9VK=EOo@Actes%1UBbQqc5F3|g9$l% zsdHV%J&!Me8-ckV&39D_1*uLg=!p_0`9RD!Bdf^=83(0v_Uj)u+1}3G}*ptBc zoY+tUPeBKK4rlb)1e!G!!0?EP9I-s2yIy_e4Iv*0pdo%7j`J;Oc@K7_5t_>{U&VQ(hxJe^U#C>+7)X>pX_N!tcSoC5PhFHS8Jlk-ciJbBRY2 z|7^JK)7IDiM!Jln-mCT3qwDeH(yz{?w$`+E`Hso)z-P#a<0VmV}}!qHr(cGzt*HadAYrki?-%y z7irQj`EYu~kFM=^rhN427f!!$bn(--jgR?l%|^A!r6%(UC!cW=ula0p=@U+U#!Y;T z>;83JYLkzj9%I_%!qJ4&BYw+u-dwK4>%Pg!ZM^kcP4eON($7`fi(++st4}WR`1Y#p z_~ANN+igBMIk|Ofy?4w&T9aPkS|3~X8mn8+oZg4l zAFIh&buRr=-*T&sKE2}CT=!_}<42P|HMe$KGgh0Lb{?rszV;KVYuZ@+nze(C=F-+g z+m6@jYQNQ{M>MT(Jh?Hh^Xc8{HtN~t#(Jb*axEuc&!x6AM(XNZTbo?sbxrKLU+n1X z`ovO~+?ZeYsdaTcb~w6r9P`Q5+S*P{&9Pfu+bemwtx0YyKGvs>jn!m~K=(F7C&|MIEh7D=jzzG*cmUJ zo*65CdbS*09p5OHT-&$yYkQ-4)ni7NSUXPLv$mrT*L8KQtr^p)Pds*A7rW++5w7dg zqvqJPE_OJ*!p4&eC)f6?^L1UVPfcR2uI_OrZQY~Q*15W8#%p7>K6ZLWJI0CCHK|(^ z+o&eFmg}Ay1%n?h^*z2A`cvEX9Q@?r z4mi%F#LVjt@e-TEqA&ibo+Z}Co%1P8J+IO?xoPRwHNE}&>S$)C@v+ODtUX(XKTe+5u%=j_>?ZqdmmXtyFQ>x1;i-AGl~| zu(~=|_j26i=9O)1$ThXC#@_W@)2rqweTQE3oMG3wm(7iw|2?J7*Y=_whpDme6sYNo zUDrO~Jc%1KFSMtH6R%l2e%2_bPx7*{K_dfWM~k)|<7GAEC!&lqr>?h_*uW)jeyP%R zrGM(=Hl{E7!nHPbH1Xv*`iZEqhJhT(z3_W=^c&`fKdm(OvM%E~o_1ep;wNTJ>{+9mP;Wl{g}lS7RjB_BS3~c*(w1t$ykpFMBieMCUvvaBAjNZN{x*UCGa2 zU~je`^J<|kV^U(T&s02>)Q12{a?H8liZYk*2Q0PJYuzWxyq+k z^mDWsGi<&2DO}g4X5gZ!W2q0uE{PORp! zd6=)`wLW&8&pgkRt97w!UR0lWI5qLdI2ze2{Cyr8a(DoBCE$YZv9t zq-n>ApMLGw$&bY|R$Uu?a$|MpqDg&fT0i52ldt{MCKhhvZEdTIzSgy|_^Ayi-qzN6 z?K+=a8*er7hhlz=>1$1$PrQxQadXKV_ew0>#^TFf(WOuP_3Zl3#5U5@dhOK2uJd)S z&ZmF0t?pd3ZLjnUCm&8cyiwd-^07Da?V0>o|2j8j8w=8_*444-!pWU!+%{j=x4HN= z*LrQQnq#kL7O$t)#cugbJ?i|JZOon<^^GR=;nc(*)3v?hr*_P*`_(<`n#9{!;%CDf zY0P=0|GKySvyQcP{BU~Ie%)&7g`MyJ zt-1Vt##&TsJbyl@*Re=nYyLj=M*TO+jg5J(`q3W!cizXd?K`F$i)GwJxx{6!=vyAU zU#;f5_0OjFMsph*H~R3H=3HanO|L~YWAU+AJ;yWUYwiCl`@5Orxz?OXpL}@HUYsev zsBTgGY|inb+C}mI-E&+t&PMTe4*w_CS&!e2mt4jj%a6rU7mg-<5^p)WHeUN@qp9_= zt@Db1uAJKNxz>;A?d3I=Yh!1tYwK$NY%JqNW`+3;dVgHU9TlVjD{kLiQp9B~2Z;ZFX>ijqU@>k*G=!{ilxjaHw%k6KWjma_}gyr{58gb)Up{o%?~`7h#>SW*gAoYrGfg?B!Y<&$-)U3vB!uVEvbY|NjJAVaM@a4C5BowuklKDy-J;+i?$ve6iIz zhTkXS_q~~WlGvf5v%mAY5ooLb2C&9|=kEPA_WDxmf9v-=J_XzBer(6i+T4%ybR6e% zJ=hKQ0q5*`jBA?{es9zYJ++<7^T7VA@K&JK`M6gnfjvCi&P{t*Gwi<$b$r)rOq@XH z+WoFFbBP}(Zg1x<_Ltm|!r9A@;a^x{+Y8&{VeChN`!V!{A1Jo-GA4J7V}HMMWeP{{ z9vgT5CxGi;2KxE_AM@&-gf9b*r|;Q4ik(LWB{^NiWC7Wcrh)i_S(B0I)Ope@{o=Yi+z3E1z;ajo{weQ-_DyZ@gf_p>GD zSech&`tSc8)7l;w^Sg?y_nAG3uiklF0BYoRL4$b4(C%{*MYZlJcd40XGw)K85 zo&8=3Td&sp>^ij7JO!L<)|PSX;n=yy?up++W!=sazXWz}*MQFebL!n|Yv-)%QP}T~ z@;tciCEyreg13RYi@z6kOy}#o)YwCI&d0Fr^<0_1<2v4}={+Tuc{{%2%t2!E4fxiZ zcP?GYJH{7_opEL7GU!=@{Trh$_)}nxZEh2AJ?cG2 zvhjn!dAp{IfMaJ2=j*)nv$wuRz7E*Snq}ZT?}hE_y3+3@Y}ae;=YYM;T?nkze+0J1 z@5S;v3^U$QVp~D>>)Y7&5yqX<7WDQzS^OvHr+y7=1Fp~B+I~l@zWrY){4i`E;e7la zG;P=IxwJODt;^gpC$-wv`X1`%jP#Qa5tn^k@Y#7DY!*fLQEWeBso7rG@uvOy_abIK z>G=h6+V1%uqMtG+zaR4;{v6wJj)5D1=O}f)4*Q*b`diEwd(Jq~$llX!C1)?cH%(0k zPLT7p&GYedMPtpB+-Het=Ui!DW(;^%^?iQi4)!yu`=B5AU57UT_x^$=*D;2=d0}nm z_Ze^;=x?U)my5kz#!7GJEjsFNWQ?64xv=L@+wYT;{a!cc_)gf*@NWcf2hm?u>`$Xv z#=aZ&&n6vX_}&wpY)tkw!|xZ>-cdMt&p}U}-y7(;GcN9^bkV+-m_0pvv(np~?+x;5 zsdq2+f4%hZGl{Q%#@vskW}l|?I$p)@FWOHQ&V9B%xQLwdh#&TO*ay5X`%2#L&oqA< z+%cAO&|ic7{Jf+1#(ljoH$!)6vGx5d;p?XNC->vGu!r+TUQI`htmXR1=w5V(!R^?|JyX^C z`ueHj58u5=pEna*p;zvwb$waSk!p--ZIk~teU?hEDgHIs?ydFTDx5jUzCOyc(%aX- ztl@a+y?vz4d-pW)oR6@*9;o!o=zI;Ud-ROeff?|AXxDRceJ6;g=U1^^L(iUESM8xS ze*bi8H=)s<)6dVg$IATO6VF*+#@UIT8rKo~+S02pv3D2$O|WzNHQ2u!uupnf`xasy zG2ffgOLneVL*o8;lyV6pXmE!!N)3J{ch@I&#!-u zi7tFO@frE}vaeP0m6iVYv0rwU82islt$*&AG(9dM_Xos}lwS7B>zVg)j{a8ieW~l3 z`g%UvN2yVhIb2PxXW$;M8&_=zfo!n&)pIau=6IX}N^Ronba z_)iu6oZ9rhp4?{QzYH&_uX27azQq_la>hJ!K9ip9kAtq#tMkts2da2q@=p^pU%&2V z4o}eYb<}x(jh)1oXQlRG;vX)#t+0Q_KURGIj5uhEpPsVouJ>eEAMw47*7c|EFk@R6 z|8151)7ZPw=CgojUtK@UcWwXL&8z3Ve{Jt;_I}^h-3Qk8{=f&<-nF*n>hrGs6N%V7E3<4cHFwI&BYI=S88gmbBWh=vE@Zr)FZXY z#}C)>je18PPF*Kpj;{3+52wEN6AQPwIv!gN)FTTjoc?LN&eeY6W4tJq_~{YWt;EC0>$fnMSafwPx$s!7u4`@Ua-~ONVe`qw zPEE_HskyE_Q+@KOKb!v1WSq09-AI!@nM=5?i5+e=iOV(AdA0dPu|a$WeKg^^cVaEq zn%J#I-&~#F$WE@UY5Ugs+OBo!Wh{2BjV%wKhhKZkuKj!7d&ix(-m+`oO}E~@8{U7% zoom0pw&l=$_Z>QP2>8GI_N@NEuKoM(y!Gb0_V2mr_FcE_d7qlq^RIa08b5yCAHU#_ zt^RnEKYr04Z}!LK{`jS})mL2c#^vEBHP{sKlPZ30UE8zznp<|=vUkr-E4%jZUAu1e S6*t|wd+plSPVc(;4gU)*%LZrw diff --git a/test/data/datasets/monks-problems-1.pkl b/test/data/datasets/monks-problems-1.pkl deleted file mode 100644 index 58061c482d71f3bd84e61d8c42cd1b7eb45666e7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16009 zcmeHOO>fgc5Ow+isDR3WN*pg0@04fKdmG;)< zfA9}sXXEE=oORgBHQ01VTEE>lZ)V5qaoUTIH-3D6?nquj?dO-DXOt-aglrzbPPI;%b7&1UoUFJbc1a3bD5Ik>w&yT|HXk6$=W z6^7%Avn|`G-s83vm*@nl@ZQ-P%lrFSy5GaH{XW}zR{za3_$|DM_y0os)S%{7l05fw zuKQS3gyJ);=UjWcVgub!_O0U7^sLIN(>@d3b1ExW#imheE^A+y)@A!gd6@rTYWkeY zs{7yiM|aMqZJ~<=XOaWjKVjZndE&ETbv5g>COVylI1|f?1#glgj|Te`g?VZ-i7l#b z9;ox1e!nUT}m9|I5X}@Gtl%mWD9+Rb0}}q*`sVZ>&*sCV5i6 zgm;AlL($Mw&5PO+EyV}}uWD6Z>ZNk7)pD^$f-9~Vl9*kvSK`xXq!K((lGzvJ&BCSq zkQbPpT6(hbg17nSy@4}P(VmYzpH;7z+AF%b?H}7emF^!`R4)egHV|`{e6cNYu5gG! z8u9_bscYg?4f)8_oQYn;LrWSNE6|}4TRKmY%bn}N0TGjxN2ZSB5kr_5<^m;;wGbq^ z#T-VQx8czB}q|&%ZhntJ`d?uY76LyR-bA(dog@YmQSBH)b+s z_dmOmKCflv(D6Es_9s5ex2itGZsUbNDVh;_T9hb4l}9 zTmNED;aq^1<$Q3TRYg3nG~#~JJl584`yV|}MU?;d>~y@ye;t57G zq7r-vO{M*oh2O#-;{8X!%T0 zv!okLV%{CZY|NE>ewU@}Gm?+g)$i3KwISQyco-{rr3@cMDUTx-CfzZM*pSDGk}qtB zlVIhOLk^5MYRu+R`T^I8&CK3 zXE4sfbVNshV0tkRCQ*{cS&-s6I|Nj66+SbJ0O;J-f!eWrg@@U2E!R0v;$7LHBcO@> zMoBTG}m$96}5M_6G4}>}P`kk73!3pRU=OVJCW8jho%1ooMY-05?9}DFK_r@H8<> zyf5w1rs1pDtg^KQ%)EQeJ^-5Ts1%s_2zc+y=lpC0Srb7jc9L)uV4k`yNCpde@7nkj z7sX5Bl1Y5ArZZVw7VF{)B%^p+yd&PjbP^loWV(*&A-2R#aSIbe+}1fEK7?ElAI%~9 zu@7%f2 zD3sVH8jDX05{oZ@ulCZ6h1?JYV;Az3PG>@rAGT1K!`D@{u^t+4g`_>y(3VvwYJyp? zD%D`cu!=IH0#FkSqAJy3#juJpqXJM945BL4V8yVCGNS@e6AYp%)nLW2iZY`DP!kNI zD$VDs?K(^I6|XA=Ul)GE+Hk3`EV^0X24P4~t;L+fG3qstWH=%rp+mhkMj*-v&Rymb zz%_^fgq~Spu+Swo)L}y%HmyC^pOrYwXRPD4mu76c%2;V_(S`Y`_X69tY~Y21{BwX? zj^%&=ZyN*$m5xQvmgP9aThJhckO0dKK?gqU7EGXqIEO5@S`NHm0-b>o=wKO#JR>h* zMO^#5e+J?(pS5kbvovc@a4!Hh!V4G!Z|WeYaYVn`q{SqsAwEfiy^-`Z;kBA($%po( zvH91g_KiMsn09vlJo^!t&)1H-^55}QTGy4>`q!H*QAhKKD2u}$daUbVOX!$AZPW%m_m}DNr}{Lgw=@QHM$9LtqYd;~1k~#INEv@w@m#{3-t8k5%^{MmfBy literal 0 HcmV?d00001 diff --git a/test/data/openml_cache/org/openml/www/datasets/333/dataset_333.pq b/test/data/openml_cache/org/openml/www/datasets/333/dataset_333.pq new file mode 100644 index 0000000000000000000000000000000000000000..34aeff80b70eaa803ee1c0a02805fe86430545a7 GIT binary patch literal 6016 zcmd5=U1%HG6`mP;9mmbq+r^{GU^cGU3~A!n8vS_fg*?p2)?Xw=maX4zab`w8mNFV? zrIBSh@os3LloFPOX17oZH7$j#NoiP0-dchm_MwmcflyfbL-wVxB~K;ngXuYUMzSo~ z>$DELJBhFFJ@?*ozVn@P@3}ZC#CUd)eTB7!*dccCWr`Y|qbZ8wv2kGIWGQx5tn-2I^*U`_+rKWWQ{H22{P+}t;B=1*F2jXi3|M}^aQ{BV-9L2V9zxy7Cn z8yBz)e*Cc450}C7`}Ez8A3y!%-Cs7}`fK>D$A7x`_~$o%BEIvt$>~?4clhtSKK=94 z$I;KCdkm#9;KgUq@BE{<-7`I2xD7Av4Onh7+};THV1#%Ql?#-p;@bA8t`1Iqy|N1vO4`^y)bccK++^x|bc(Vq)A2F8S)7*bv? zCQtg{*$LA??K?D^>Vd~wX*&?_+i1fZqX&kq)M;UO=+-2CT8`ZPemI>?}Bt4UfR zXPcU|X;L`{0oT>Pro_sN*i& z3}CBztBM9~4XvnDvId4mx!USXROC`l(GgoYr%PE$mtgq~TTX-JMXi8)<15}vbGEH6 zFb7fCNS#I%2Y~F0d$1^yo-0&pMMY8xCkOBpHGQrRjn*iyws#Q-T!$kf;^&X(S)%4X91#&ee?+cl;Ar-6 zL`3}j5givbQd9a7b;Mr`B62U)a*|%Dk?TS&$~CFh!rP+Sl4`X|6CtkWYV~48L+R|a zeVVty8MT=JyXN?5$F$S7MOn`XvDnq*>qf=4K4NF^+b9GCL0Ezx{H_RsOArf!R}cz< zr$A()cwoMX!o|+KzW{4-3?I=#=0RsufPL^Am556s%)uB|24KDTTA-{pGQNZ><%q1y z{)8@T;o{YzDCb(inv`^350ol}fEHYn%ULy0Y!(7#_nMrH!gree%ttr|P z$b_ajUROMU*}y`akKi6HV$Y-moZrYN?Dc>rSdtwZyqt9B(MBeIu&GVBf^`NoT#-!Q~0A zW&N>McsYRB%2F~>&v@q>=_Icb9?M>P)*sj)b~O<=1>X31doCrK^O3czFQLiKU^VUV z)=j?q@B?S|s~%!Q&j9X>&udR3FMhzHC3J{oM^7e&@f0>OE}$2<4yqZ94dBcs6Z84y zCi>iqv1QoKR5Qtq>nTS&CPEQ-vMZ%Pk;K{Oh1~IOc5@53?U`hDU4a;+lTES?F)4nv zJ4RJmTXG%2Kjamk@6u!B#zGi51-&s8TtFw&LjTw^C8aXpySyaFH4iUUs!YrEoEi^$<&= z-@K2ZY>HSq#ezo=JWZ^n38Sn++4R)0bSAMeifnvleP#v*q8P)XODe12!|6^b2PsbZ z46bpex&GSB3@OixIAhqyj`?|_oq2o{2m+2BD;H;G=3ubs#X@gZ`ZRV8tJuT$f*=@o zG?`k)rv=J{7-`8o>5)cEoA?2)@jk$@-nPFKkK?i~kjDFh?8PaO)Vm1rF9enlpCEQU zLdZ(eTq&9O1rdma7tR8X7fpQ6z#pldcHCMAjNSaf6X(Epv5g-dIXE$gV}8ZN*X9rX zC6A&b_zUB^3rK7VfY&bw6>7NlALj?Ev`?M8kO z|5|7Vf)qw*kcaRNG`}U{l*ezv->3TjQ-6U!X-461@Bal0yk|%N literal 0 HcmV?d00001 diff --git a/test/data/openml_cache/org/openml/www/datasets/333/description.xml b/test/data/openml_cache/org/openml/www/datasets/333/description.xml new file mode 100644 index 00000000..4c00296e --- /dev/null +++ b/test/data/openml_cache/org/openml/www/datasets/333/description.xml @@ -0,0 +1,33 @@ + + 333 + monks-problems-1 + 1 + **Author**: Sebastian Thrun (Carnegie Mellon University) +**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/MONK's+Problems) - October 1992 +**Please cite**: [UCI](https://archive.ics.uci.edu/ml/citation_policy.html) + +**The Monk's Problems: Problem 1** +Once upon a time, in July 1991, the monks of Corsendonk Priory were faced with a school held in their priory, namely the 2nd European Summer School on Machine Learning. After listening more than one week to a wide variety of learning algorithms, they felt rather confused: Which algorithm would be optimal? And which one to avoid? As a consequence of this dilemma, they created a simple task on which all learning algorithms ought to be compared: the three MONK's problems. + +The target concept associated with the 1st Monk's problem is the binary outcome of the logical formula: +MONK-1: (a1 == a2) or (a5 == 1) + +In this dataset, the original train and test sets were merged to allow other sampling procedures. However, the original train-test splits can be found as one of the OpenML tasks. + +### Attribute information: +* attr1: 1, 2, 3 +* attr2: 1, 2, 3 +* attr3: 1, 2 +* attr4: 1, 2, 3 +* attr5: 1, 2, 3, 4 +* attr6: 1, 2 + +### Relevant papers +The MONK's Problems - A Performance Comparison of Different Learning Algorithms, by S.B. Thrun, J. Bala, E. Bloedorn, I. Bratko, B. Cestnik, J. Cheng, K. De Jong, S. Dzeroski, S.E. Fahlman, D. Fisher, R. Hamann, K. Kaufman, S. Keller, I. Kononenko, J. Kreuziger, R.S. Michalski, T. Mitchell, P. Pachowicz, Y. Reich H. Vafaie, W. Van de Welde, W. Wenzel, J. Wnek, and J. Zhang. Technical Report CS-CMU-91-197, Carnegie Mellon University, Dec. 1991. + 1 + ARFF + Sebastian Thrun 1992-10-01 2014-08-26T17:11:18 + English Public https://api.openml.org/data/v1/download/52236/monks-problems-1.arff + http://openml1.win.tue.nl/dataset333/dataset_333.pq 52236 class https://archive.ics.uci.edu/ml/citation_policy.html artificialmythbusting_1OpenML100study_1study_123study_135study_14study_144study_15study_20study_34study_41study_50study_52study_7uci public https://archive.ics.uci.edu/ml/datasets/MONK's+Problems https://link.springer.com/article/10.1023/A:1022622132310 http://openml1.win.tue.nl/dataset333/dataset_333.pq active + 2020-11-20 18:58:56 6cd008dccee6a34420c091dfe7cdb457 + diff --git a/test/data/openml_cache/org/openml/www/datasets/333/features.xml b/test/data/openml_cache/org/openml/www/datasets/333/features.xml new file mode 100644 index 00000000..6cca4738 --- /dev/null +++ b/test/data/openml_cache/org/openml/www/datasets/333/features.xml @@ -0,0 +1,84 @@ + + + 0 + class + nominal + 0 + 1 + true + false + false + 0 + + + 1 + attr1 + nominal + 1 + 2 + 3 + false + false + false + 0 + + + 2 + attr2 + nominal + 1 + 2 + 3 + false + false + false + 0 + + + 3 + attr3 + nominal + 1 + 2 + false + false + false + 0 + + + 4 + attr4 + nominal + 1 + 2 + 3 + false + false + false + 0 + + + 5 + attr5 + nominal + 1 + 2 + 3 + 4 + false + false + false + 0 + + + 6 + attr6 + nominal + 1 + 2 + false + false + false + 0 + + diff --git a/test/data/openml_cache/org/openml/www/datasets/333/features.xml.pkl b/test/data/openml_cache/org/openml/www/datasets/333/features.xml.pkl new file mode 100644 index 0000000000000000000000000000000000000000..03189bf8369f06b88fd0cd4bd39fe6a701908ab6 GIT binary patch literal 509 zcmZ|Ky-ve05C`zMiBnZbMGR%)0br@<2Rs2{38+tiB9|H*DR!&aMlc}3ejCS&@&p_q zrXGT)lka~2v){7MFP60D6;F!>UXMugpe1BZXu}Xd34CwXhap^b=OPmmS>=%~n@N)Cl9QDxXB{rNujnVCM*Ih7u;m z>$Pl3U0GY1$1qbQeqAt?{%%w`)kEOIxn_oHv=3JYl%~eO=9ITUsSaB?h~yy1d7o4h tbsSV0sWhlXzX~RcOcuNb#U9{!_q~e?QA3nFo2gr0eBrh?bh)obI92_rzg^3el$sj-&J3z3V#b8^;LXOv1c1!{! zO330ArzMF4UDDQVX;a$igy}HTb~=kPO=r@Xw!7hL`0r>7SeHk68wOw0^VPYjOb8pa1F22T%9=N1?I?PHTeqn*0H_!ere zW|jW2k#w%1sI$0G-ak7rH8M0hHr$^kC6r#MnwTA*dPaML&Ee^hp~Z!={>=2`czQOI z8J;G{%a>MZ?^PK;64$+Bx-JZIzl-brTvu@2r0b;_UFZ9CeLbh^<&3V4N#-Babs?+k zORQITis@`GAJ=^&sq6SrT}K9VoloexFu{DrH}>kj!FKbkx5WK{A(nqc*ZC7%AJBCL z`yFR~)0CzcX1VTV`Egw@vEO;AS+Br$3w>HH!um$H?i(q_>EV7v*GoM9IQy|Q%yNfx zE#o(2{3n^uabIS?j8RRmVE-#P&I8;p`thKa8{jzR*`KA`8M|2(M zxI}o|%ir_0`ucs8;mb@ig{tbgkNBPrZ`@b*8-1YO6Cva zJi_QT-$Ch^Vld@aa)<$0ZF zd&@kIysST&hmY&{MFe-B?n^w3xb8Q}`iScO6!&imy;{?kc;C2vy)AQo4MWQnad{od zeqrp={J6{?j^B0mPvV^C{o)AYM%b>*BjbL?;q@;2j1gn|4{$B{{W#Bu0Dql{bDz05LOnMC|gpB7POvZgGEx)EZ(c-|D)kBFSNJZ{<7^D>V)P6HBm&RcO^uw8@a z$ujF1TiC9QpYv>Rej_~JWFE;rT#&plPLZF`cm4HGfI+#rt4h_I3962*-7q;~@KEfyW{Hv?1pc&%6A?8h=^F#qk;te73jDwVX3c z(wFhE{|29PMg@qS`aBc7KAuOG>CaeZ=r3cPPE@x0H=`62rz-)|H+ z5BYZOPw_rW{|nA-Ix7E@><}1K<+p4j4%0H;=D>6mRVl-vhEDl6Py8_PjdbU zU-Fn|oJ$-p!M!Q*H zK<)?Pd~V3Rk#R=Y&UGHIoPPzmpXPNf_j}?`QI7K@^Fj9ch~$@XjCSo;UiS02?i;cX z%6`J@Xn^Nkp4WGAzVJHvNce2WILPBpbIs>y{t?|5zn54(&ikp%qX_HenQnAwy|~1k z$0_@2ocE6i=Xrqlt+?Rx{E+ixiTySBoGq|_%Phad_T_$RfcJe_r*Xdjii@8~?Vs!q z1)fh8oX5EAgIlzm%JE;APa~x&gj?2Cw^MP^1?>yU;{N&|ahZpWIhm zmwC$f;c*`S0P78~Kl1!hJZFz{et3RM9t%9)soU03p2uyly-Uo$&h;|exyk)F?<0oX zm$qwra-WlD`%4@@qgnIie!Yq3vmxW+eL?oWo6MK|%Kjz$UxEFRaTGYN5vkMKo*{kq zqagX>d>-L>XdGdArWZIK#eG-u#_L-4Ke<1M^Lf%FcZa(+ypVH1^3lyW59xY2t?T?gT}xc#TwWHxc|Vi+R^a=IDUN?} zU*Ypw=AXpf*va^8zqoFXYI=e5V94_=pKpRIaNxP_{Q(Nl!Atus^B}#X3Gc19=Dsj8s^%Ee6?B2 zV^Dvr`$}dL{MPrLtiJcL_cK=-c9@se_j-T38aA|C{LS{y&h>8eSzp+6bw0Dc;V0c| zuLOEiH!8oMTz{haQf2B&?IS;}{oqdy_PzuSXM=O#Uf=XY`}V-2DR=v;o-a1M9{$e7 z&OY~}r)$GE&U~}Iw{i8Z)DIqk>UW>_`P;8FZ@xbT*_Uf8D=Np|+dKHcc-e-zvv&>c ziKZ^kKa^_N^B3-S+jHUfK4^bo{Ckz|_gfk-{>BHjS9aC(g^%uE`qtjo?(2>1$Hz9= z_SLL^efyv8-&)xm-gy;j($62>UzJKVgmxw}Uu~OuaeDPqxa%_wqh4QC=GOw>Id%0@ z<0n4bIJc)EGtpb#yna*W#Hv)om!Ed89nFv2fAB`{Tw6!w;T@^*OJ%E@);D}mwmrkzC+M*J@t_bH4W1x8xeW5{K@-)LWY#$o!uBm&ez9?cJ&d zySCEz!&GyBPq_C>jXwmjH1>i)pyA2;>J8!nc$r&hK9;WN-Y zkPJV!sbSsKJ!fkgs&5?X-LUYp-0t1`ezG_I)P*bU8|L2V`t=YJxSRW)SWJv##bsDrJY)RYI9N z2sMvc;FzvfyT&o=wkE882iAtIiH)vsfAwM0TirU>-3TWtJ%>H8t|kl?xL{~Y%bEl5 zf&*u)xglr`+i7r{d#!M}ZzAki_nWT9@Ye2$L)|cZ#tCmYv8K7(<$&1}1grHSFVux$ zsO`Ru9@`0)chodPZB4i5k;6exV|8#C&c*EJ+)Ge>6l(Htbfi3JhhOYWIcDGby5<^J zaJ*`h<5dE6fB__g;7oU~7A`dAbICuDer>{pGIQy@|%yM(}y6 z;MrLCcyrbHpwnvYbsajq&GCTM2noO4SY|fYG>>&#A*b6_-MYr=-S|wcD_jvYC!ou; zpsYW4w*y-u-dZR41RM@mKn+wwSDoVuZ}e5Ts@5H7oC*TKfq@fs0Wj-poQ+N};)K_? z)mVq1E_9;K)#?O~xEf7JxjbQbG+}oKT`Aw@n#7O`f&+@S}^yxR_j&F~t#tko&2P91gCZueQM*Vmk^D}z0uaIyyM z$GyROf>tXyj_X0#8royS2WfzF$Ta;f5s%MAFmFK7 zUFB{>0wke0>6sN+P{XVJL3<{+sgItl!5S}3sB+sT!UEi7lKa@Q@DEE5c*r)G9+wHL zP3?dOaUH9I1kg#%CmGXHZERrA!GROeLjsmVh2^%&h=eYaq)EedQPU4)YMdJ0Pr$g= zz;dfdA&>>8&omtc>{GHP1d;Bc(YX9j?m|^G1!dZ8l(*DKE65f6AUs)Aoq)UA4FNKa zm$F54Y+2yLt`Ge|H(dc2vr6Ja8Z2GHS`*Yjes;tv}@69J_k`~ zs}0t{6L7r#@Rqp+=&MLJ^hCdrhI5fUO9076xHNefr6BZ$bRrF>BK4aACbyHxx6_yV z0)PE@npH4J06jX|GYWC?HR_3c;>ak3B5h~N%i7*aMM9anp3vKm zCS%d#kqep8ybT*-^U1`!(3U*%b`0(tj>3zbkbJc-f&q$LYmTl8MZBTWcV5CT}c zA@OD?3hDm7WM}M|fi_5DC@)S%E<_TqCyzf2>1bceY;q!cZ)8WV?VLnEwm)i6q2k0fx?uN25KNr!tx3r|MsZXw_&m zIaW8CK`&#W=;WuPyF*)Ncb`{15IXn+|{#b_!?ZOMNaaqL|OOa>yn$Q;B`p0w?%7u;6i5zb3gy(s_1>y zdtZ;WMW2tP`$KK_L_^17Z+tcckJd-vd^%kZUvE1HcSjC2CS#4SBty@Jj$%}=K<8-e z9E3370Fmeyzl28i!HzStTQ0%Fm*E3AXaB(!qz%J;|Filx+Sl#q+l7O=`fb?umuk^g z{U+@Ao?~vv^?VDL(n<6R>f#q*J$Syb1=YU$51lyYaFyVC-Ekrd-Y039K`v+h?K0MI z4BmqqTmER_SAGQWAv|&Yp+)l-1KA2{r@G>KbsLZt0pN@O%liQQGW>5d2Z!!zhFm|m?*(|_^o<|EpVyHuH{QoRvtq~K zBntnxul{TZHkg;O`P3fxR67Bz#?R;X9{fK5(*X02!}sAk<(;&7tO7i&!9HIL_Rbuf zCD|nI2cxU;gj8$su8EW6$zhz3!$`19_oRtJtInK+@m%4j*!*pPP2hZT4V-VbU^kuv z06)RKB{{>Gg7+2s%00XlybE#JALpO7$E9p1X+SsY~iG;!espr z;M)Boa1VxVFgmG`lpA7CMR+;)3ZY}q+YS$;LpEk?DULnyu7#<(u?|G9Ur(cnwZ^( zOQXK^)~L7PGr%w$}V^rs7)0FscOU4ZY zCzOGIbWsP}bZH86iKFC*E=`h9eWfEJqOHWVtWpqIwnUm@P)n5n5QgqbLyIU3rAd9V zttB-QsnEX_x(2~gaY$C;o$sMmDZ@fb8Dv{_H_q#MwjH&g{dq7_&!9mtrq1wCv;A zCuosnPw4fP9k^{>DPM7jG-!3H;Yw4eDoPTkB9pXOu~ZxREU#cyFetDZOzIWG1VQ?$ zRDgm;mUNr6=%bODuPIuQ^pt>xQW7ktRmq+f;Qk7@B8Y~_5SuRT8@r*qN`wj?nv&9X zSzMS{%SEz9McpD66A6n3KrV_+)n!tVT!KiFBwNI|rLA~^t|YafwxAW2j5e)Glpv#; zsT*2^RYgmL$T@|_?vXHGLEoizEBE)NL%?HPjv2ZLFd$(@WW`iI4Ntql&pqWy3f$)@Dd%}6H*DC!>sjJ_)1l8 zThuH`20}8Z_CxiBLZS+!q)-GT**|M9mS>qPy^q|VO7E?q7Ze(=n9rn zW9=eeVXok)uBK^mQfEayj-momOQp4Wq%e_v5M4#rDjMO=>&EJE{le=exN_#ux>NH+x97PAYdck!S}sxKbmqJ!|{83_~OR)8rU}6_@du10zR+T2VP~z?els94!*y| zEjD1=ZpU-YH( z$OxE*<@Q=8zU{RQ%e3t>v=45#gC61#9Y6YpU^pnZ-(z~+mL2dqhJo${@ZAB1!jCUJ ze14R4ye=O`%x^mm4%o5+7W#+~9(2)gY(RsK&+GF#0h>mRLXOwtc1Xl<+a3eIpmbX< z3?P|yc^seNHV_=O0)Qy!yzR9u!-XNS1BM^JGr+O9alk$tyq5&5fa%9^2fPSJ>GPTa z{1(CXf#EYuOoYd0_&t~>+mC_vIc~#r+kT7^cKzss+w1XJ9*?IC1@UE#NeT2}hEWYa zZjbH5So<*$m~sPOBzfGnVPJ3!jF0UHpX0IIKFogr=Y`MoTLH`-0tOJ<@t8Pg+%z%# zsDcv%5XQ!D0Rkot5TOy#i(kVS^xcMyv2c8VPWgP67vmeiZ-Oup_)U#6M$TK9dJLl9 z^amVF$(y)j)fw zj^A6Mw97C^3(b-^36cm-5d4r+h(ZL-_ z&<76|iW!$A(kl9#~DE-ktlp<4#K+}S{I!Bs;1VTg^!o!l>ATj(<#1x$>i{g$J zby|s_oON4wUO{$;`xjn8^rqrp_fkP0`f_a3sj3gyl|xvcv~>I;jZRv+ZaH?Tr#?~^ zrL-*aR@A9Qm&Q;mCC)U$p??~Jj44@N71gLgJT0xHNUliIo|24$lR}{=DP1Top-@BX z=ni+aJwnz($|7SUP5mRYY?q~op*~nJu&#@i$PfJy3}s!zkQ8ZBrSz1nbf{Kz(jX4k zN`=%kSL^5wt0)OlQKm^i8(|Hq6oCzsB*7^Q#Nw`MQb!yhdXdQJq)0?n>q6Qjs{XJL znvzgBD@5XvQAXh=_U^oD9DEt~i@sF&*RLAw7WuC(<+Zq8q)`BPs5}@oA{tQ1$h58s zg&HF<@zR6mih$_SWW`n>B&T@9&|Iph!(xO217E8l0qCZ zS9DA3F(6AQwo)Q7YAAcuCTHkRQxq2Qi6SH=ru)h*_KXcFDHb7A)hx;>igHGA7*;_m zS?#CtLfO<(Vw)97ix~K!iYy9A)yU8^&0GQ2vYJbpN}VA}#wh9vnV|@91(g+vq!|iI z^^|YgCxj$aGN3;!iyTR2fZINEColNCIG!MAp3`Y_XwKlo1kI=_wCTs))ii2vobLAT^&fQ9+4o z6;e|?g{Z)a-e_U6MG{o#3u{GDFig}q#1SFgP;@O%3}s9;*r*~AU$Mx95@9)RvnEv~ z5XwDmk<{66u~~$oFNW^)5-!+LUb<>#7U|oYfB74pa=95>Fcb7rAVDt$67*6afo~AZ z1inHr6Zj6nOyEldGl6dr%mlthuoCnh0ZY`u5|yw-Ei6$D%h;`N6!zcx{j_|yJUuvZ zn!a2*-%J`}re}g?UI6Z^k4llTJGm}$`__Cf_!&CU9Z1o-Qu==9c)sJ6j<9{AP z-$2WMB+@@VJTs`il0JdI04# Vv7d?qR2-xtMa3a3hF@K5`2Vhn;nV;C literal 0 HcmV?d00001 diff --git a/test/data/openml_cache/org/openml/www/datasets/40981/dataset_40981.pq b/test/data/openml_cache/org/openml/www/datasets/40981/dataset_40981.pq new file mode 100644 index 0000000000000000000000000000000000000000..fe69f0e09f0ae755e168d8f9ea690b4396f21fd7 GIT binary patch literal 20170 zcmc(n31CxI+V}6tO+u13Bx##ArG=y^O#>xu(!DH_q)iuS+NLz!*t%~tZRy^!G-cnZ zD2f{vkVOPVR0Kq=Ac!amu8cTNXU1W4MxFZV`1-x$`@P@)+!Ur@8OIObndtA{d(J(} zbDnd~bDncwIqd>@99Oe74$80p2NWR&j^3|8U7G}5?qTn1{yBBoE8$RAiNmh;_a`qGAr*@26u1Q)jjmw^T(GfNBkOtM(c zFw!Q*Elp(lrE-2rxmd$D&!GwDqsBC(lS_~AjZG+Spbaqv^py+I{fYw$ku--HkT&rL z=85%;Mr!6WPNZWpXqtRn7!6HmC_!Cret)G{#dps_45Jug7NFum={i2M2oI=&ksH#P z5=g*I7i*XjgP!R(=@~n+Fcne}MAbA)<%o4Fb{IM#ayq}HPORs9#8#vYj$ZEvq^~Bst~mf zqB0dT;85AkP;w#KjJ%FPLpq-+7GGgBiX0{dRWU9z-`IwnI@I5U`z5J}&-jT|OuuU# zGiWH`GYkSodmo>PM|!h#9dm^Co2!t?LEmYh5GF}%4hjD=o1@ZWl_Pf+|X&-T`|eV^$nsW@0cflTlzWq%uQU%w`qi z-uyihT!CO3zNSHJM6-^ABPe?Sa?o!5j6dMS3t9yOEtQ~R_u)e3613vBmmon2(`PWF zUviis1L{V}Irag>>-mgc+{9;8sP7Ud>h}=82F34S+$NPGT_I;CicuTH<4Hy${g@zHK;66hqpGkn``yoodOT-M}rADep15mDX5Gpxw8TzrG~B#EoI*a38v+57p5wLcc+o{U~J+gSQ_IDY5r6=FOiW zUWJl}Ff1=|%{6b+i4yrt^rS*Wu48gcdI)Alzqz34*1i58{Px}lVrJ|$v2B0Z{Q6nY3XfWGQ?4MJfk!L);^5o0Ezd2XqO zOJznAk6^f(4^so6;+IsTd)&y>#0?*)9-SU5))``x{)$VXo| z(BHdq3{R8NXY92KyaJ3BmyYx%kCCYp-J*~({(2u>R)6}gGQmdw$OQxIOqO%r{MfoH z$9t1)W5-WvpEr2<%xf}@WtDeG?yI7;q3PT7MW-KIe@}SM?(_$&dpB&4C_Gy8^^w)H z113eMw0`{`0^ts$=+#Xj&sdWmP*ac-EN`&ijZcd0}-#eJak)p%R4PrdKzvhGder>zzi zMOWtBXFBsv{f=jTO7wRz zt9%o-1V7*Gc=M6aoJ{4()<2~Ve*N^8U#|}%JD!aYTpYix7`uU-JGz@3(mSu7#$6|2y>u-f{x9_ZEOUC_gY)&uGc6O>nWfNcDrptcIU1;ZD z$ZI>W?wh{chc9e5P5B@DyPxgfJ3Hdkg8LT#;kOskcbF8ffBK5+qcabD?sM5Y$G6g7 zcsFxplYhyl_H~If{_w>g*FNypBmSM=JvOjnTlRL+Q}$niSbb{Uk#X&%|GDXM>aKA- zY23=b1DR{YyMEx$c`seMCrzrgmmleUYFxy*N74_re*WB|`Ff-4;MGdy<$z3}+a41!^p>NYZox}3nLFcl=2j*3-ihJUT-f=tXY>CllwjEZ? z+cj966Y&h-JlmF)Dt?4$kf6YW) z-#FblHuz^uwoyH-<5J~mX{Pc?mh-Sa7JfUIJ%EarfLQQ2SOfe(2Uq|?z%oz;U~m{4 zXax(wM$iZ*gKZ!Z2*51x6tIGEpdUnlH^H;uWv~il0P4Yp0r<4cSnx2Q`bYq;f?QAx zvOpzx8Uz9b*bams4Zv10AAo9b3CsiD09KQ6g8RXX-~^}xlRy!81w?~BFdh^DF?b0~ z0H?r%Kn)~-TCbhpF|ZC?1VKOwUI6pK9Pk!+2c(0!zy#p^Fl&JbJPJ00=Ybt`f+s)} z@CWhW4A>8PfEFZy8n7D3feR#qQZNm80bfuKW&jtH=N1}1_%zzpsI=fIO70PF*2K|a_8G~hUR4Ge(u;3&8kECq?+GUx_}Kq#03P6Iv| z1Kt2#pad)jo51@Z1uORU*<%n?6Hu z*G#iys;?!?nw(-w6=>3Qvh;uqValx8nRZcDwj|j%C&@82H$@#BImbIWPhgEyPxW%< zdxaMircBWIl2D<>m>gXcP@FqgZ48(#MaRm*z<-A%noD=Gq3t`hnAWz{Dn2dwT{6UkGp&C-Odt6krtRjYkBtlcQt zw0XB4sZbF zEM25q!KTUfJ)^65wob?{-0J&mLPJ9KbCP4vYvwHG9cQOTo)DhMQst`Za>>aT0Bgh^s0&KKvrM7Io?t?RkS24QYI2;0}@_*DWIz7QU5WHmv#HdK-Vi?Sv|RRLD4A( zd}S>G>Ky^6&uDnBdaKTgd>_qfCn3IqU}Z|$x!0DQIB`Cw>A>q3rUs-5-`#{kW82W}ej2p#_?p15Bl6Pb6it(&n`vUz> z2I{_w*KL?Y_x>YC{;2EI1f@0;=NI&VLzYM2dJ=_Lk}>06%i2j0z-iNvUg39nC* zp%cTih^G`d^YUyCfzr?ENbxh9on6d5FnVC?Kd`ibKih z2GT~Jo5Xsw_b1l>J65+YMRy^YJ2~N+I;!68BT6^PR0eaJn=SWfP5{LTm>3WX;y^q| z0EtAvPvQ{+V-yG(lR#*U=22b8aMxHU`b~lI1HmndPVy-F8t>amGOTHSPfK#!Zqwlp z6AJe@O=nv=K3pqY{o%Z2`#ZWqTQ-E%tk8ZmdBENIc*{!fkdDcsIg2+`?oBvjGKC9P;k12TN;Qq(E=Fe|gyL5fKqqAgnzaf5=uAqC>{*r-!gCF*;?anP? zTNB=S`mO>X>?EX_J0!)$YUI z&Fccn3aUTe-CfvSGHIPL+|j>m+-v)%b}5#%PH~2e>FDlHs7&aX@^JxOT6AAzj~o?o z!>4Iu{Ev>&eUza)na+(0u3ttS6*ziX2A4=Hz$??mdlyv@Z%B4zZ8A@%udQAG= zsf(d6cAYt8pKJ8K>fQN2=UqWBEDlPlzj`h3$ZM`^`!;_Q|K`ybk+<{X7sK2|2GA?w;Z}_IX}@w z{UYT-QKoVokL#sA{XaHm6rfI57NCw>G*|)}K^>s;S_XWg-VmDgvwo+WFbTrpw!CLp%Vkbe098V zy+A9NB-65!Sy_Z&N>~^%s3O@YUo9&zs#(2EXd(i=EP@fKqxEczK^@Bz8lvTKvUs^E z!7G4OMkdB3vMParRkK1iX(~KV_AWg;Q=O2+nheCu`pPV7gT@+b(5sVW!TnrvVk%+8x@^hqGzYp#?RJk>)1Gh4=Z5hY+!_5Xy93W zgoUaZ0gJRO%g(4*Mz~m6gFw&IYisp}{0xCQg^j3IMhH!MZKE(i#)b(K4Q!K$cxi=U zR&RkQNyuvJMe`R=W7W-xq$M^?&jz#_>|U%spiLl=S@ovu*{sZDGesEGL=+}h$-ScB zIJUE127@V4*dY`~*yQ$%T0QTuR$gwHa0*>OsqOk)+6(ZVzUK$L6Q$ke3!EB!ZK}Yucjq#EYTWd zhUp~Aq>qw^NrbaIt?Cu|f|U_bY;(0iCtyiOymqNpZDRF$p@L;GD=2BY)Io@AhBGP7~$%+k7P}9V%fV0sTtW zbNsF+y1y~3urFHEtNG0TZ$)W4kEH~E^~LJ(GY%Cl*}C=e?)Be(+UmFA`I*6AE|{}& zP2durvc(TIEU&G3;*;0EiF(5n+C6_T^SArHyY%VKqdPySDOjIhefj*pV??*9apYo= z=xN@QJpbc7-8z@k@7a zPxUUW=>Pudy;p|DUb|Z^J6DpVj@c8(KQs38t$PBe&8atSOKH@sdhnvx=h6$4CN-;f zXTEYe`h3co(+|H|Czs?;c1&OdAwu0i^T?JZd0KXa=YNu?JK3aL)ASQ9J5H1@!GDe9 zR>65ga|ybF+FUv=sPuHr;RCuZ()EJ}=z2oeH)?z7+C$e1y8h60kFF`y&eL^_+EKce z(RGoog|Mez2$kOnZ+3F{G|LTpdW`+{x$CZupoej5gqnU;u81jjLz;eOm6d4vi71e9 zM++G<&KyJ{hP}jSNWZ^cBVf!WYf)GYfUNkGpc2w`;|#Ovt=ci}cl&TskD%=UpQ%aL z&?HzX2=f?iC$rMc!@?+`ZI$OR?gQTAg!mw)VpR;va4#OPU~S8?lxUH7a5{6olf_w= znOpBfQkp5mvzA>VBDY1(lppvlJ%BRG=|U+{iTH!4jmlDrEY=b|fkxdVCwZL)vTQ+> z{Q~zM>TNi@na`NP0i1%6OHk|(bV|&qNCVZ+b_(b+Om&Z-s>B`7)QI?F%te*7Tlfs$ z&q3~mEaqO`z~o6R!+LKvn5 zCA*85XBAjsjnw7oC7aM>y1Wt;J)kwZd2WW!!+>y`35FobL4PABqx)d#-)#tMF&RB$ zag*5pVXVE?aqm0Ouo~B&GAIKi*`Ej0{>XC z#YVsfTWn_vcE`bl&3BF?GtJQ`CSOmKd6n3L+wz@xWCo)o)e{{~6%pj1SQUxq+nhvh zR^%y_NP@EQPo5KsIlt;=#SEXHA7lNWWp(fL65YMMgj3D!_t94*%HK&cl^=NA@k4@d zpWtC6a;{|=?igj9S)rzhx_8voy%*5=ya>1eb%m)rNnK9rzS8MNU1#dD+Cd$l?q3C< z?kZI%HBAoz9l%WG%vA*L)WxK3XeO8lI>07CO`?r%Sc0)3zn>;x8|1=Qu- z4(mK;3S-q)Z0|ARX{QIamy~g4LiN zB!X}-AB+KyfLd@DpxdDFU_EF8@t_(A!9H*ga00q52nHd*3c|o{kPPTnA{X2Z9t6w4 z0T2jgfi<8T+z)QERiG-lv9yhH-pt^{nZc1WD`CC7#nQ1vM&zu*$CvM?RQLz5bjQF3 zu|7C=hF7l<)01a*Y?wk99;D~7lNjM-mQ_TsQw$+U#mXajQ9Rb45gXxLn&9xV(Y%sU%@8xhz+)^5BAXTEqlhA8b|y)-vsvOOThb&Sevl6vl+ByT zvN>i4k7d0vT!gHKRj|2pSV)u?E@2a>tU*pDzd*vu8K1(;BA=PrjNCFITpnGlAVC_& zW|7RD$L5fDMbv!OM^OOFv|wQnTjI+qO2;x#5K~qjR3T?7<&v3I6$EmVYLhQp6Rrqi zP?fxvWi9sE{&hY?!j`aX(V}`boUjTPBWX}HvePCJxlv5`4Q$h5Y!le@oG7zAh~+IB ztMDbwEk3LQs%$mDazwQ$H0>-eXl$}w(GetNIs?Xbb@P@iWydOz&!#YhN7Jn*36AwJv2DjxO;dL*2+`O4Ax@jLlJ3j!Z)BR)a> z3~!O#XHJlO7umf>&K8Rq_Awq$%#O`DEM<__d0fqc=c^=eykoA?2Nvs?i$Hpy) z+7JA6k39@)fBnE2%i!>d^ErS2kNkCS^%32L)tt5Gj~M-jI#2klHM2yZ#KQT|WuvoTl@89-ua$ z3Oojmg2~_tpti#YoB}5RwQCaqT_f_rVLX%mTh37AyplKrg5UAAnZ?AIL!|Xat{u zz2JRt-A2#A^H$&w9s$&rrh~6QGgt?T!3*GdAOkE=f@MGtV!#q$1B-wO&^03yxWE8- z3%mqEz!ney=$dd5c!N*CbKo`b5O^6J0NcT@z*z7gSPlxnZ^4J)8Ylw^zzwp%MsPQn z4pxA6paW8H4(tOR;2p3AtOiejAg}>+g9`8m&;&jL`@!$P9uNvt;1D) zfkc^82;=^2v#Th<&}uUL_+Af z86k8b#5(AwWN~grNZ>dU6imh=euA2?n(=fY3?sx_N5TowPnt|3$P^r*5Q49;Cfq`3 zS~fa{#Kw_$ghN9^5)yGISntR*;q)1I%_PK3EY{?d5Szdlo0>+_kxQI0Yc?U7#JA5L zl9f$J4sj48$t830NT?IJz4HkvAcaIniz><@!lCKKb7SX``D8&zC|OuiNb~zyx zmD6UDDm@isTJ`uEQp?sYLY8`0LnG0Ki1FC8m^Awl%h2=|1nAdULOKcQ>LyFb(q%?M zmXjVrd{Cvkmw2xTB`cAPtRmTDbqmQSYiOust&VIM%3ViDybKY`L$VizAi9UIBkMyq zY{cUxLN=4}TecE+>2lJC9E99Wwvl_ty*=CSTfUr-`$Kjh*+$x)2WC9@5P5h?NC=nKdGrX@Hi14e8NqZLqPK(5^|V?RXj?_YI!;c6kfH38FPwUj@C_k^y!7$_dF3=YLrmiJ*$6LxCoL&&8N9l30NC*;tjcL~}1 z-uvW(kPjQgW&?G0vCT#OSp)semj)lC zXJwXNq3%}$Yp`Ma^o5&ubHiW1JlovC6GV6Dgl9K*yYn($;1BXRoV~P^*HpSnl@l1H zcNmZUQF<7Y84M*VC0sC?QS!Klm@xm$L%t#(`pT0s_IgTBFz)7H{PmzGd%};hCtgqB z$)0pQNYm4UiCZ$zt^D;QGzY@b*TchEMv6Eb75n2-uFHu(y$=iy{6TgHf0X~f{1<7z zz#k3V(U(-f7kskEV@v_VHo2#KkJ5@7xiFOg<@T6@nZK$r) zPSI!^N?nzRXlSdiN2sN>w7C^QBlq0cgvaamwRc&Q` z#loB8Jc2E3?P{VGYTFuW%Nr{^GPJpxx^BE^EUT+5Z>6%iDqBk{N?S{je2KQQ0m*9{ zs%c(lUaBEcyKICoH%p*?hg+|;M2{j17k7&+RrQUft?{uxmSe<=5h|fw zc5_8Bqg2Gj-BMB8^@gD>BNTrl`X5%p$#;upxwu;@xvrx-&}__4X*O<@26J)0yey{J zpAsg16k)iyUtXBRpOPhE6j`{qTV%P><^Mk;ftwq*j07(3mP&qsk?@mqENPSmb8-I$ zSxm-Jgz-e)BF!DC-!!U?<05aV=3mhPrqQgNC-Rq<<|hqh&qyhXvnL}@Lzc0)&_dsLR^=O8GLmQ2 zl|^@&$_nDDXdT6cHTC6PNT1pYd461ev8%qmEG@UH%$3?&JO}C0;!I^}9kiT|BIGYB z%&MwLukYaACC@UIx$04V?sa)hWrfzb!kFB+@-!;9rM4`3*5cxVETgAAruvGsRFsih zL)(gW*O$4n+Kbbm7b;Woto(diQEPVctfX6H@22SsVp?i%$dx?HYM+xq<#Lr4S7&=sp(8rWBNxVmsiQJ~WZQ~c z%A!+^S#=JsjIx4!V^Kjabz#1C6&KL?W;K>K&xpm{88@JiFAGdM;hQ9wf_OsLOnd_t4 z{^56{>D@HKF8$bkHsm*zH8^5DLLJMCS!wO!ZW{foxH?bj}Qp6A_YW8jxxLv5b*#xG5qdd+n=|e)5E?g z#wh%gq^@Cqu%)zMR##EstT@h(Yp}-QxvMP3W6vvG`CaAF_3dS~)K|=i&aAVv+2_o{ zxQ1VRV~!Qqmp5cJl|{#L{-=FTY*%KTy|&6>Nu#d#^>ld5q2YFm-C{|vPQjxQ?z<(1 zy6+hl>biGOH{MFanbmOPtu55o$A8v`oYd)0Zp6h>MK9Pn;D;9$%c7j_s3^J}z#k^k za5lYIxlIE1o(3(nPK(uQ&vROw?J=J878{M^DyDb5PmIcmisBv}bU$G=(r{YNaC%1u zSE#3I?vYDM+hDQTIj8I%!f><*Q0OGpyr0|uvoZdTrY39 zKQdBKe$=92d56c7mD4wOxv>>TUXOexPhpL8(xUKci?xI9(}#b#+}$&juE$wYs&Cdu zN)Zj`V9t!3k6czy&Jpv`qi=dBfZuF7UokikS$_RI0MVZLIYLw;N**ykEiE)`yiuP= z|DH)OVt#ts*Ui;rDWdZoazLkCeOv%~M|-O+$jbGD!*hs0lh{%`^OuYNk^FXAwuP#U z%a1$vOQ!h|KzoX)9_=)Ye!ec$Khqzbo}wB(ljhd`@XU9VkEsM}>ghx-$MAgTn!w4+ usMJlR&5PSATMgw+O$P2BkIqv1|;@L3t((Es^=nXUNyogDmM?EW8!bfpmh literal 0 HcmV?d00001 diff --git a/test/data/openml_cache/org/openml/www/datasets/40981/description.xml b/test/data/openml_cache/org/openml/www/datasets/40981/description.xml new file mode 100644 index 00000000..70843ade --- /dev/null +++ b/test/data/openml_cache/org/openml/www/datasets/40981/description.xml @@ -0,0 +1,49 @@ + + 40981 + Australian + 4 + **Author**: Confidential. Donated by Ross Quinlan +**Source**: [LibSVM](https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html), [UCI](https://archive.ics.uci.edu/ml/datasets/Statlog+(Australian+Credit+Approval)) - 1987 +**Please cite**: [UCI](https://archive.ics.uci.edu/ml/citation_policy.html) + +**Important note:** This dataset is derived from [credit-approval](https://www.openml.org/d/29), even though both datasets exist individually on UCI. In this version, missing values were filled in (not clear how) and a duplicate feature was removed. + +**Australian Credit Approval**. This is the famous Australian Credit Approval dataset, originating from the StatLog project. It concerns credit card applications. All attribute names and values have been changed to meaningless symbols to protect the confidentiality of the data. + +This dataset was retrieved 2014-11-14 from the UCI site and converted to the ARFF format. + +__Major changes w.r.t. version 3: dataset from UCI that matches description and data types__ + + +### Feature information + +There are 6 numerical and 8 categorical attributes, all normalized to [-1,1]. The original formatting was as follows: + +A1: 0,1 CATEGORICAL (formerly: a,b) +A2: continuous. +A3: continuous. +A4: 1,2,3 CATEGORICAL (formerly: p,g,gg) +A5: 1, 2,3,4,5, 6,7,8,9,10,11,12,13,14 CATEGORICAL (formerly: ff,d,i,k,j,aa,m,c,w, e, q, r,cc, x) +A6: 1, 2,3, 4,5,6,7,8,9 CATEGORICAL (formerly: ff,dd,j,bb,v,n,o,h,z) +A7: continuous. +A8: 1, 0 CATEGORICAL (formerly: t, f) +A9: 1, 0 CATEGORICAL (formerly: t, f) +A10: continuous. +A11: 1, 0 CATEGORICAL (formerly t, f) +A12: 1, 2, 3 CATEGORICAL (formerly: s, g, p) +A13: continuous. +A14: continuous. +A15: 1,2 class attribute (formerly: +,-) + +### Relevant Papers + +Ross Quinlan. "Simplifying decision trees", Int J Man-Machine Studies 27, Dec 1987, pp. 221-234. + +Ross Quinlan. "C4.5: Programs for Machine Learning", Morgan Kaufmann, Oct 1992 + 2 + ARFF + 2017-12-04T22:15:38 + Public https://api.openml.org/data/v1/download/18151910/Australian.arff + http://openml1.win.tue.nl/dataset40981/dataset_40981.pq 18151910 A15 4 derivedOpenML100study_135study_144study_218study_98 public https://archive.ics.uci.edu/ml/datasets/Statlog+(Australian+Credit+Approval) http://openml1.win.tue.nl/dataset40981/dataset_40981.pq active + 2018-10-04 07:20:02 920e2419a28215109651fcc5cbd1662e + diff --git a/test/data/openml_cache/org/openml/www/datasets/40981/features.xml b/test/data/openml_cache/org/openml/www/datasets/40981/features.xml new file mode 100644 index 00000000..ba431ff5 --- /dev/null +++ b/test/data/openml_cache/org/openml/www/datasets/40981/features.xml @@ -0,0 +1,175 @@ + + + 0 + A1 + nominal + 0 + 1 + false + false + false + 0 + + + 1 + A2 + numeric + false + false + false + 0 + + + 2 + A3 + numeric + false + false + false + 0 + + + 3 + A4 + nominal + 1 + 2 + 3 + false + false + false + 0 + + + 4 + A5 + nominal + 1 + 10 + 11 + 12 + 13 + 14 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + false + false + false + 0 + + + 5 + A6 + nominal + 1 + 2 + 3 + 4 + 5 + 7 + 8 + 9 + false + false + false + 0 + + + 6 + A7 + numeric + false + false + false + 0 + + + 7 + A8 + nominal + 0 + 1 + false + false + false + 0 + + + 8 + A9 + nominal + 0 + 1 + false + false + false + 0 + + + 9 + A10 + numeric + false + false + false + 0 + + + 10 + A11 + nominal + 0 + 1 + false + false + false + 0 + + + 11 + A12 + nominal + 1 + 2 + 3 + false + false + false + 0 + + + 12 + A13 + numeric + false + false + false + 0 + + + 13 + A14 + numeric + false + false + false + 0 + + + 14 + A15 + nominal + 0 + 1 + true + false + false + 0 + + diff --git a/test/data/openml_cache/org/openml/www/datasets/40981/features.xml.pkl b/test/data/openml_cache/org/openml/www/datasets/40981/features.xml.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a865af56b27c6a868c9e6806a8fc73bb21606bef GIT binary patch literal 899 zcmaKqJx{|h5QZBkZJMSHVqgPiilD;ud#r?5%EvE2QA!Pt)VNiXpfVuA?hW&Qx!4I9 zkv7srmi@f6!?>e-FpEH&<|a zo$q~$_J;s5uZC<+HUg4ZEQkO`6D+usW%85P6>QjS8HFrZ2=3)8|0P(YBo+^#H!cJ> zups<^rO}M6{V0s%kUba7w35!I1~ru(t-;y`k4r%v$ec!G9X<-`-qAftB8^0n>~0}V zjWi|MQ)FH(b)h`!apMvS>WIF*@qZF z3?W7k;~$Tz=24X%gEEgS6>ZZ~dPdJ_rzo(diEC0k+|mJCBW+2JN^DxxD^k}rL0t;Q zy9JIWa6n*ApeyAM*U-?0Lh Path: - return cls.get_project_root().joinpath('test/data') diff --git a/test/general_checks.py b/test/general_checks.py deleted file mode 100644 index a1d8610d..00000000 --- a/test/general_checks.py +++ /dev/null @@ -1,25 +0,0 @@ -from pathlib import Path -from typing import Union - -from meta_automl.data_preparation.dataset import Dataset, DatasetCache -from test.constants import CACHED_DATASETS -from test.data_manager import TestDataManager - - -def assert_file_unmodified_during_test(path: Path, test_start_timestamp: float): - assert path.stat().st_mtime < test_start_timestamp, f'The file should not be modified during the test: ' \ - f'"{path.relative_to(TestDataManager.get_project_root())}".' - - -def assert_cache_file_exists(path: Path): - assert path.exists(), 'Cache not found at the path: ' \ - f'"{path.relative_to(TestDataManager.get_project_root())}".' - - -def check_dataset_and_cache(dataset_or_cache: Union[Dataset, DatasetCache], desired_name: str, desired_path: Path, - test_start_time: float): - assert dataset_or_cache.name == desired_name - assert dataset_or_cache.cache_path == desired_path - assert_cache_file_exists(desired_path) - if desired_name in CACHED_DATASETS: - assert_file_unmodified_during_test(desired_path, test_start_time) diff --git a/test/unit/datasets/__init__.py b/test/unit/datasets/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/test/unit/datasets/conftest.py b/test/unit/datasets/conftest.py new file mode 100644 index 00000000..bd43ec3e --- /dev/null +++ b/test/unit/datasets/conftest.py @@ -0,0 +1,18 @@ +import shutil + +import pytest + +from meta_automl.data_preparation.dataset import OpenMLDataset +from meta_automl.data_preparation.file_system import get_dataset_cache_path_by_id +from test.constants import OPENML_CACHED_DATASETS, OPENML_DATASET_IDS_TO_LOAD + + +@pytest.fixture +def openml_dataset_ids(): + ids = OPENML_DATASET_IDS_TO_LOAD + yield ids + for dataset_id in ids: + if dataset_id in OPENML_CACHED_DATASETS: + continue + cache_path = get_dataset_cache_path_by_id(OpenMLDataset, dataset_id) + shutil.rmtree(cache_path, ignore_errors=True) diff --git a/test/unit/datasets/general_checks.py b/test/unit/datasets/general_checks.py new file mode 100644 index 00000000..5e2f446d --- /dev/null +++ b/test/unit/datasets/general_checks.py @@ -0,0 +1,24 @@ +from pathlib import Path + +import test.constants +from meta_automl.data_preparation.dataset import DatasetBase +from meta_automl.data_preparation.file_system import get_project_root +from meta_automl.data_preparation.file_system import get_dataset_cache_path + + +def assert_file_unmodified_during_test(path: Path): + failure_message = ('The file should not be modified during the test: ' + f'"{path.relative_to(get_project_root())}".') + assert path.stat().st_mtime < test.constants.TEST_START_TIMESTAMP, failure_message + + +def assert_cache_file_exists(path: Path): + assert path.exists(), 'Cache not found at the path: ' \ + f'"{path.relative_to(get_project_root())}".' + + +def check_dataset_cache(dataset: DatasetBase): + cache_path = get_dataset_cache_path(dataset) + assert_cache_file_exists(cache_path) + if dataset.id_ in test.constants.OPENML_CACHED_DATASETS: + assert_file_unmodified_during_test(cache_path) diff --git a/test/unit/datasets/test_custom_dataset.py b/test/unit/datasets/test_custom_dataset.py new file mode 100644 index 00000000..5f34b194 --- /dev/null +++ b/test/unit/datasets/test_custom_dataset.py @@ -0,0 +1,48 @@ +import numpy as np +import pytest + +from meta_automl.data_preparation.dataset import DataNotFoundError, CustomDataset, DatasetData +from test.unit.datasets.general_checks import assert_cache_file_exists + + +@pytest.fixture(scope='module') +def new_dataset_data(): + dataset_data = DatasetData( + x=np.array([['a', 'b'], ['b', 'a']]), + y=np.array([5, 10]), + categorical_indicator=[True, True], + attribute_names=['foo', 'bar'] + ) + return dataset_data + + +@pytest.fixture(scope='module') +def new_dataset(new_dataset_data): + dataset = CustomDataset(42) + dataset.dump_data(new_dataset_data) + yield dataset + dataset.cache_path.unlink() + + +def test_error_on_missing_dataset_cache(): + with pytest.raises(DataNotFoundError): + CustomDataset('random_missing_dataset').get_data() + + +def test_custom_dataset_dumping(new_dataset): + # Act + cache_path = new_dataset.cache_path + # Assert + assert_cache_file_exists(cache_path) + + +def test_custom_dataset_data_loading(new_dataset_data, new_dataset): + # Act + correct_data = new_dataset_data + dataset = new_dataset + data = dataset.get_data() + # Assert + assert np.all(np.equal(data.x, correct_data.x)) + assert np.all(np.equal(data.y, correct_data.y)) + assert data.categorical_indicator == correct_data.categorical_indicator + assert data.attribute_names == correct_data.attribute_names diff --git a/test/unit/datasets/test_datasets_loaders.py b/test/unit/datasets/test_datasets_loaders.py new file mode 100644 index 00000000..0fd1ce17 --- /dev/null +++ b/test/unit/datasets/test_datasets_loaders.py @@ -0,0 +1,24 @@ +from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader +from test.unit.datasets.general_checks import check_dataset_cache + + +def test_group_load_new_datasets(openml_dataset_ids): + loader = OpenMLDatasetsLoader() + datasets = loader.load(openml_dataset_ids) + assert loader.dataset_ids == openml_dataset_ids + for dataset_id, dataset in zip(openml_dataset_ids, datasets): + check_dataset_cache(dataset) + + +def test_load_single(openml_dataset_ids): + loader = OpenMLDatasetsLoader() + for dataset_id in openml_dataset_ids: + dataset = loader.load_single(dataset_id) + check_dataset_cache(dataset) + + +def test_load_new_datasets_on_demand(openml_dataset_ids): + loader = OpenMLDatasetsLoader() + for dataset_id in openml_dataset_ids: + dataset = loader.load_single(dataset_id) + check_dataset_cache(dataset) diff --git a/test/unit/datasets/test_file_dataset.py b/test/unit/datasets/test_file_dataset.py new file mode 100644 index 00000000..125cb641 --- /dev/null +++ b/test/unit/datasets/test_file_dataset.py @@ -0,0 +1,48 @@ +import numpy as np +import pytest + +from meta_automl.data_preparation.dataset import CacheNotFoundError, FileDataset, DatasetData +from test.unit.datasets.general_checks import assert_cache_file_exists + + +@pytest.fixture(scope='module') +def new_dataset_data(): + dataset_data = DatasetData( + x=np.array([['a', 'b'], ['b', 'a']]), + y=np.array([5, 10]), + categorical_indicator=[True, True], + attribute_names=['foo', 'bar'] + ) + return dataset_data + + +@pytest.fixture(scope='module') +def new_dataset(new_dataset_data): + dataset = FileDataset(42) + dataset.dump_data(new_dataset_data) + yield dataset + dataset.cache_path.unlink() + + +def test_error_on_missing_dataset_cache(): + with pytest.raises(CacheNotFoundError): + FileDataset('random_missing_dataset').get_data() + + +def test_file_dataset_dumping(new_dataset): + # Act + cache_path = new_dataset.cache_path + # Assert + assert_cache_file_exists(cache_path) + + +def test_file_dataset_data_loading(new_dataset_data, new_dataset): + # Act + correct_data = new_dataset_data + dataset = new_dataset + data = dataset.get_data() + # Assert + assert np.all(np.equal(data.x, correct_data.x)) + assert np.all(np.equal(data.y, correct_data.y)) + assert data.categorical_indicator == correct_data.categorical_indicator + assert data.attribute_names == correct_data.attribute_names diff --git a/test/unit/datasets/test_openml_dataset.py b/test/unit/datasets/test_openml_dataset.py new file mode 100644 index 00000000..81042648 --- /dev/null +++ b/test/unit/datasets/test_openml_dataset.py @@ -0,0 +1,27 @@ +from meta_automl.data_preparation.dataset import OpenMLDataset, DatasetData +from meta_automl.data_preparation.file_system import get_dataset_cache_path_by_id +from test.constants import OPENML_CACHED_DATASETS +from test.unit.datasets.general_checks import check_dataset_cache + + +def test_openml_dataset_creation(openml_dataset_ids): + for dataset_id in openml_dataset_ids: + dataset = OpenMLDataset(dataset_id) + + assert dataset.id_ == dataset_id + + +def test_openml_dataset_is_cached_cached(openml_dataset_ids): + for dataset_id in openml_dataset_ids: + cache_path = get_dataset_cache_path_by_id(OpenMLDataset, dataset_id) + + is_exist = dataset_id in OPENML_CACHED_DATASETS + assert is_exist == cache_path.exists() + + +def test_openml_dataset_data_loading(openml_dataset_ids): + for dataset_id in openml_dataset_ids: + dataset = OpenMLDataset(dataset_id) + dataset_data = dataset.get_data() + assert isinstance(dataset_data, DatasetData) + check_dataset_cache(dataset) diff --git a/test/unit/test_dataset.py b/test/unit/test_dataset.py deleted file mode 100644 index 3ac46d6d..00000000 --- a/test/unit/test_dataset.py +++ /dev/null @@ -1,40 +0,0 @@ -import numpy as np -import pytest - -from meta_automl.data_preparation.dataset import DatasetCache, NoCacheError -from test.constants import CACHED_DATASETS -from test.data_manager import TestDataManager - - -@pytest.fixture -def dumped_cache_path(): - path = TestDataManager.get_dataset_cache_path('data_dumped') - yield path - path.unlink() - - -def test_dataset_caching(dumped_cache_path): - dataset_name = CACHED_DATASETS[0] - - cache_path = TestDataManager.get_dataset_cache_path(dataset_name) - - dataset_cache = DatasetCache(dataset_name, cache_path) - dataset = dataset_cache.from_cache() - dumped_cache = dataset.dump_to_cache(dumped_cache_path) - reloaded_dataset = dumped_cache.from_cache() - # Check data integrity. - assert dataset.name == dataset_name - assert reloaded_dataset.name == dataset_name - assert dataset.id == reloaded_dataset.id - assert np.all(np.equal(dataset.x, reloaded_dataset.x)) - assert np.all(np.equal(dataset.y, reloaded_dataset.y)) - # Check caching integrity. - assert dataset_cache.cache_path == cache_path - assert dataset.cache_path == cache_path - assert dumped_cache.cache_path == dumped_cache_path - assert reloaded_dataset.cache_path == dumped_cache_path - - -def test_error_on_missing_dataset_cache(): - with pytest.raises(NoCacheError): - DatasetCache('random_missing_cache').from_cache() diff --git a/test/unit/test_datasets_loaders.py b/test/unit/test_datasets_loaders.py deleted file mode 100644 index 1596e312..00000000 --- a/test/unit/test_datasets_loaders.py +++ /dev/null @@ -1,50 +0,0 @@ -import time - -import pytest - -from meta_automl.data_preparation.dataset import DatasetCache -from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader -from test.general_checks import check_dataset_and_cache -from test.constants import CACHED_DATASETS -from test.data_manager import TestDataManager - - -@pytest.fixture -def dataset_names(): - dataset_names = ['australian', 'blood-transfusion-service-center'] - yield dataset_names - for dataset_name in dataset_names: - if dataset_name not in CACHED_DATASETS: - TestDataManager.get_dataset_cache_path(dataset_name).unlink(missing_ok=True) - - -def test_group_load_new_datasets(dataset_names): - test_start_time = time.time() - loader = OpenMLDatasetsLoader() - loader.data_manager = TestDataManager - - datasets = loader.load(dataset_names) - - assert loader.dataset_sources == dataset_names - - for dataset_name, dataset_cache in zip(dataset_names, datasets): - check_dataset_and_cache(dataset_cache, dataset_name, dataset_cache.cache_path, test_start_time) - - -def test_load_single(dataset_names): - test_start_time = time.time() - loader = OpenMLDatasetsLoader() - loader.data_manager = TestDataManager - for dataset_name in dataset_names: - dataset_cache = loader.load_single(dataset_name) - check_dataset_and_cache(dataset_cache, dataset_name, dataset_cache.cache_path, test_start_time) - - -def test_load_new_datasets_on_demand(dataset_names): - test_start_time = time.time() - loader = OpenMLDatasetsLoader() - loader.data_manager = TestDataManager - for dataset_name in dataset_names: - cache_path = TestDataManager.get_dataset_cache_path(dataset_name) - dataset = loader.cache_to_memory(DatasetCache(dataset_name, cache_path)) - check_dataset_and_cache(dataset, dataset_name, cache_path, test_start_time) diff --git a/test/unit/test_file_system.py b/test/unit/test_file_system.py new file mode 100644 index 00000000..dba55923 --- /dev/null +++ b/test/unit/test_file_system.py @@ -0,0 +1,7 @@ +import pytest +from pathlib import Path + +from meta_automl.data_preparation.file_system import get_data_dir, get_project_root + +# def test_root_dir(): +# assert get_project_root() == diff --git a/test/unit/test_meta_features_extractors.py b/test/unit/test_meta_features_extractors.py index c5625f53..bd9b925b 100644 --- a/test/unit/test_meta_features_extractors.py +++ b/test/unit/test_meta_features_extractors.py @@ -1,38 +1,37 @@ -import time +import shutil import pytest +from meta_automl.data_preparation.dataset import OpenMLDataset +from meta_automl.data_preparation.file_system import get_dataset_cache_path_by_id, get_meta_features_cache_path from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor -from test.general_checks import assert_file_unmodified_during_test, assert_cache_file_exists -from test.data_manager import TestDataManager -from test.constants import CACHED_DATASETS, DATASETS_WITH_CACHED_META_FEATURES +from test.unit.datasets.general_checks import assert_file_unmodified_during_test, assert_cache_file_exists +from test.constants import OPENML_DATASET_IDS_TO_LOAD, OPENML_CACHED_DATASETS, DATASETS_WITH_CACHED_META_FEATURES @pytest.fixture -def dataset_names(): - dataset_names = ['australian', 'monks-problems-1', 'monks-problems-2', 'blood-transfusion-service-center'] - yield dataset_names - for dataset_name in dataset_names: - if dataset_name not in CACHED_DATASETS + DATASETS_WITH_CACHED_META_FEATURES: - TestDataManager.get_dataset_cache_path(dataset_name).unlink(missing_ok=True) - if dataset_name not in DATASETS_WITH_CACHED_META_FEATURES: - TestDataManager.get_meta_features_cache_path(dataset_name, PymfeExtractor.SOURCE).unlink(missing_ok=True) +def dataset_ids(): + dataset_ids = set(OPENML_CACHED_DATASETS + DATASETS_WITH_CACHED_META_FEATURES + OPENML_DATASET_IDS_TO_LOAD) + yield dataset_ids + for dataset_id in dataset_ids: + if dataset_id not in OPENML_CACHED_DATASETS: + dataset_cache_path = get_dataset_cache_path_by_id(OpenMLDataset, dataset_id) + shutil.rmtree(dataset_cache_path) + if dataset_id not in DATASETS_WITH_CACHED_META_FEATURES: + mf_cache_path = get_meta_features_cache_path(PymfeExtractor, dataset_id) + mf_cache_path.unlink(missing_ok=True) -def test_meta_features_extraction(dataset_names): - test_start_time = time.time() +def test_meta_features_extraction(dataset_ids): extractor = PymfeExtractor(extractor_params={'groups': 'general'}) - extractor.data_manager = TestDataManager - extractor.datasets_loader.data_manager = TestDataManager - meta_features = extractor.extract(dataset_names) - assert list(meta_features.index) == dataset_names - for dataset_name in dataset_names: - meta_features_cache_path = TestDataManager.get_meta_features_cache_path( - dataset_name, extractor.SOURCE) + meta_features = extractor.extract(dataset_ids) + assert set(meta_features.index) == dataset_ids + for dataset_id in dataset_ids: + meta_features_cache_path = get_meta_features_cache_path(PymfeExtractor, dataset_id) assert_cache_file_exists(meta_features_cache_path) - if dataset_name in DATASETS_WITH_CACHED_META_FEATURES: - assert_file_unmodified_during_test(meta_features_cache_path, test_start_time) + if dataset_id in DATASETS_WITH_CACHED_META_FEATURES: + assert_file_unmodified_during_test(meta_features_cache_path) else: - cache_path = TestDataManager.get_dataset_cache_path(dataset_name) + cache_path = get_dataset_cache_path_by_id(OpenMLDataset, dataset_id) assert_cache_file_exists(cache_path)