Refactor data storage (#15)

* refactor dataset classes, use openml cache
* fix example select_similar_datasets_by_knn.py
* create DatasetIDType
* create PredictorType
* remove DataManager, refactor cache
* update tests & test data
* allow explicit OpenMLDataset creation from name/search
* adapt examples to the last changes
ITMO-NSS-team · Jul 20, 2023 · 5261b8f · 5261b8f
1 parent 267e6f9
commit 5261b8f
Show file tree

Hide file tree

Showing 59 changed files with 2,350 additions and 415 deletions.
diff --git a/.gitignore b/.gitignore
@@ -129,4 +129,4 @@ dmypy.json
 .pyre/
 
 # User data
-data/
+/data
diff --git a/examples/0_loading_data/load_list_of_datasests.py b/examples/0_loading_data/load_list_of_datasests.py
@@ -6,9 +6,8 @@ def get_datasets():
         'nomao', 'sylvine', 'kc1', 'jungle_chess_2pcs_raw_endgame_complete', 'credit-g', 'delta_ailerons', 'pol'
     ]
     datasets_loader = OpenMLDatasetsLoader()
-    datasets = datasets_loader.load(dataset_names)
-    print(f'Datasets "{", ".join(dataset_names)}" are available at the paths:')
-    print('\n'.join(str(d) for d in datasets))
+    datasets = datasets_loader.load(dataset_names, allow_names=True)
+    print(f'Datasets "{", ".join(dataset_names)}" are downloaded.')
     return datasets
 
 

diff --git a/examples/2_extracting_datasets_meta_features/extract_with_load_on_demand.py b/examples/2_extracting_datasets_meta_features/extract_with_load_on_demand.py
@@ -1,3 +1,5 @@
+import openml
+
 from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader
 from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor
 
@@ -6,8 +8,9 @@ def main():
     dataset_names = [
         'nomao', 'sylvine'
     ]
+    dataset_ids = [openml.datasets.get_dataset(name, download_data=False, download_qualities=False).dataset_id for name in dataset_names]
     extractor = PymfeExtractor(extractor_params={'groups': 'general'}, datasets_loader=OpenMLDatasetsLoader())
-    meta_features = extractor.extract(dataset_names)
+    meta_features = extractor.extract(dataset_ids)
     return meta_features
 
 

diff --git a/examples/2_extracting_datasets_meta_features/load_and_extract_features_sequentially.py b/examples/2_extracting_datasets_meta_features/load_and_extract_features_sequentially.py
@@ -9,8 +9,8 @@ def main():
     loader = OpenMLDatasetsLoader()
     extractor = PymfeExtractor(extractor_params={'groups': 'general'})
 
-    cached_datasets = loader.load(dataset_names)
-    meta_features = extractor.extract(cached_datasets)
+    datasets = loader.load(dataset_names, allow_names=True)
+    meta_features = extractor.extract(datasets)
     return meta_features
 
 

diff --git a/examples/3_selecting_similar_datasets/select_similar_datasets_by_knn.py b/examples/3_selecting_similar_datasets/select_similar_datasets_by_knn.py
@@ -8,9 +8,10 @@
 def main():
     # Define datasets.
     dataset_names = ['monks-problems-1', 'apsfailure', 'australian', 'bank-marketing']
+    datasets = OpenMLDatasetsLoader().load(dataset_names, allow_names=True)
     # Extract meta-features and load on demand.
-    extractor = PymfeExtractor(extractor_params={'groups': 'general'}, datasets_loader=OpenMLDatasetsLoader())
-    meta_features = extractor.extract(dataset_names)
+    extractor = PymfeExtractor(extractor_params={'groups': 'general'})
+    meta_features = extractor.extract(datasets)
     # Preprocess meta-features, as KNN does not support NaNs.
     meta_features = meta_features.dropna(axis=1, how='any')
     # Split datasets to train (preprocessing) and test (actual meta-algorithm objects).

diff --git a/examples/4_advising_models/advise_models_from_similar_datasets.py b/examples/4_advising_models/advise_models_from_similar_datasets.py
@@ -2,7 +2,7 @@
 from golem.core.optimisers.fitness import SingleObjFitness
 from sklearn.model_selection import train_test_split
 
-from meta_automl.data_preparation.dataset import DatasetCache
+from meta_automl.data_preparation.dataset import OpenMLDataset
 from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader
 from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor
 from meta_automl.data_preparation.model import Model
@@ -13,9 +13,10 @@
 def main():
     # Define datasets.
     dataset_names = ['monks-problems-1', 'apsfailure', 'australian', 'bank-marketing']
+    datasets = OpenMLDatasetsLoader().load(dataset_names, allow_names=True)
     # Extract meta-features and load on demand.
-    extractor = PymfeExtractor(extractor_params={'groups': 'general'}, datasets_loader=OpenMLDatasetsLoader())
-    meta_features = extractor.extract(dataset_names)
+    extractor = PymfeExtractor(extractor_params={'groups': 'general'})
+    meta_features = extractor.extract(datasets)
     # Preprocess meta-features, as KNN does not support NaNs.
     meta_features = meta_features.dropna(axis=1, how='any')
     # Split datasets to train (preprocessing) and test (actual meta-algorithm objects).
@@ -29,8 +30,8 @@ def main():
         PipelineBuilder().add_node('normalization').add_node('logit').build(),
         PipelineBuilder().add_node('rf').add_node('logit').build()
     ]
-    best_models = [[Model(pipeline, SingleObjFitness(1), 'some_metric_name', DatasetCache(dataset_name))]
-                   for dataset_name, pipeline in zip(y_train, best_pipelines)]
+    best_models = [[Model(pipeline, SingleObjFitness(1), 'some_metric_name', OpenMLDataset(dataset_id))]
+                   for dataset_id, pipeline in zip(y_train, best_pipelines)]
 
     dataset_names_to_best_pipelines = dict(zip(y_train, best_models))
     advisor = DiverseFEDOTPipelineAdvisor(assessor, minimal_distance=2).fit(dataset_names_to_best_pipelines)

diff --git a/examples/knowledge_base_loading.py b/examples/knowledge_base_loading.py
@@ -16,12 +16,12 @@
     # ===== Another way to get train models, but also group them by datasets:
     models_for_train = {}
 
-    for dataset_name in train_datasets['dataset_name']:
+    for dataset_id in train_datasets['dataset_id']:
         dataset_models = models_loader.load(
-            dataset_names=[dataset_name],   # load models just for this exact dataset.
+            dataset_ids=[dataset_id],   # load models just for this exact dataset.
             fitness_metric='logloss',       # must correspond to a metric name in a knowledge base.
         )
-        models_for_train[dataset_name] = dataset_models
+        models_for_train[dataset_id] = dataset_models
 
         # If you need to load data to the local storage
         # dataset = OpenMLDatasetsLoader().load_single(dataset_id)

diff --git a/meta_automl/data_preparation/data_manager.py b/meta_automl/data_preparation/data_manager.py
diff --git a/meta_automl/data_preparation/dataset.py b/meta_automl/data_preparation/dataset.py
diff --git a/meta_automl/data_preparation/dataset/__init__.py b/meta_automl/data_preparation/dataset/__init__.py
@@ -0,0 +1,3 @@
+from .dataset_base import DatasetBase, DatasetData, DatasetIDType
+from .custom_dataset import DataNotFoundError, CustomDataset
+from .openml_dataset import OpenMLDataset, OpenMLDatasetIDType
diff --git a/meta_automl/data_preparation/dataset/custom_dataset.py b/meta_automl/data_preparation/dataset/custom_dataset.py
@@ -0,0 +1,30 @@
+from __future__ import annotations
+
+import pickle
+from pathlib import Path
+from typing import Optional
+
+from meta_automl.data_preparation.dataset import DatasetBase
+from meta_automl.data_preparation.dataset.dataset_base import DatasetData
+
+
+
+class DataNotFoundError(FileNotFoundError):
+    pass
+
+
+class CustomDataset(DatasetBase):
+
+    def get_data(self, cache_path: Optional[Path] = None) -> DatasetData:
+        cache_path = cache_path or self.cache_path
+        if not cache_path.exists():
+            raise DataNotFoundError(f'Dataset {self} is missing by the path "{cache_path}".')
+        with open(cache_path, 'rb') as f:
+            dataset_data = pickle.load(f)
+        return dataset_data
+
+    def dump_data(self, dataset_data: DatasetData, cache_path: Optional[Path] = None) -> CustomDataset:
+        cache_path = cache_path or self.cache_path
+        with open(cache_path, 'wb') as f:
+            pickle.dump(dataset_data, f)
+        return self
diff --git a/meta_automl/data_preparation/dataset/dataset_base.py b/meta_automl/data_preparation/dataset/dataset_base.py
@@ -0,0 +1,40 @@
+from __future__ import annotations
+
+from abc import abstractmethod, ABC
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Union, Optional, List, Any
+
+import numpy as np
+import pandas as pd
+import scipy as sp
+
+from meta_automl.data_preparation.file_system import CacheOperator, get_dataset_cache_path
+
+DatasetIDType = Any
+
+
+@dataclass
+class DatasetData:
+    x: Union[np.ndarray, pd.DataFrame, sp.sparse.csr_matrix]
+    y: Optional[Union[np.ndarray, pd.DataFrame]] = None
+    categorical_indicator: Optional[List[bool]] = None
+    attribute_names: Optional[List[str]] = None
+
+
+class DatasetBase(ABC, CacheOperator):
+
+    def __init__(self, id_: DatasetIDType, name: Optional[str] = None):
+        self.id_ = id_
+        self.name = name
+
+    def __repr__(self):
+        return f'{self.__class__.__name__}(id_={self.id_}, name={self.name})'
+
+    @abstractmethod
+    def get_data(self) -> DatasetData:
+        raise NotImplementedError()
+
+    @property
+    def cache_path(self) -> Path:
+        return get_dataset_cache_path(self)
diff --git a/meta_automl/data_preparation/dataset/openml_dataset.py b/meta_automl/data_preparation/dataset/openml_dataset.py
@@ -0,0 +1,39 @@
+from __future__ import annotations
+
+from typing import Union
+
+import openml
+
+from meta_automl.data_preparation.dataset import DatasetBase
+from meta_automl.data_preparation.dataset.dataset_base import DatasetData
+from meta_automl.data_preparation.file_system import update_openml_cache_dir
+
+OpenMLDatasetIDType = int
+
+update_openml_cache_dir()
+
+
+class OpenMLDataset(DatasetBase):
+
+    def __init__(self, id_: OpenMLDatasetIDType):
+        if isinstance(id_, str):
+            raise ValueError('Creating OpenMLDataset by dataset name is ambiguous. Please, use dataset id. '
+                             f'Otherwise, you can perform search by {self.__class__.__name__}.from_search().')
+        self._openml_dataset = openml.datasets.get_dataset(id_, download_data=False, download_qualities=False,
+                                                           error_if_multiple=True)
+        id_ = self._openml_dataset.id
+        name = self._openml_dataset.name
+        super().__init__(id_, name)
+
+    @classmethod
+    def from_search(cls, id_: Union[OpenMLDatasetIDType, str], **get_dataset_kwargs) -> OpenMLDataset:
+        openml_dataset = openml.datasets.get_dataset(id_, download_data=False, download_qualities=False,
+                                                     **get_dataset_kwargs)
+        return cls(openml_dataset.id)
+
+    def get_data(self, dataset_format: str = 'dataframe') -> DatasetData:
+        X, y, categorical_indicator, attribute_names = self._openml_dataset.get_data(
+            target=self._openml_dataset.default_target_attribute,
+            dataset_format=dataset_format
+        )
+        return DatasetData(X, y, categorical_indicator, attribute_names)
diff --git a/meta_automl/data_preparation/datasets_loaders/__init__.py b/meta_automl/data_preparation/datasets_loaders/__init__.py
@@ -1,2 +1,2 @@
 from .datasets_loader import DatasetsLoader
-from .openml_datasets_loader import OpenMLDatasetsLoader, OpenMLDatasetID
+from .openml_datasets_loader import OpenMLDatasetsLoader
diff --git a/meta_automl/data_preparation/datasets_loaders/datasets_loader.py b/meta_automl/data_preparation/datasets_loaders/datasets_loader.py
@@ -1,25 +1,17 @@
 from __future__ import annotations
 
 from abc import abstractmethod
-from typing import List, Type
+from typing import List
 
-from meta_automl.data_preparation.data_manager import DataManager
-from meta_automl.data_preparation.dataset import Dataset, DatasetCache, NoCacheError
+from meta_automl.data_preparation.dataset import DatasetBase
 
 
 class DatasetsLoader:
-    data_manager: Type[DataManager] = DataManager
 
     @abstractmethod
-    def load(self, *args, **kwargs) -> List[DatasetCache]:
+    def load(self, *args, **kwargs) -> List[DatasetBase]:
         raise NotImplementedError()
 
     @abstractmethod
-    def load_single(self, *args, **kwargs) -> DatasetCache:
+    def load_single(self, *args, **kwargs) -> DatasetBase:
         raise NotImplementedError()
-
-    def cache_to_memory(self, dataset: DatasetCache) -> Dataset:
-        try:
-            return dataset.from_cache()
-        except NoCacheError:
-            return self.load_single(dataset.id).from_cache()