Refactor data storage (#15)
* refactor dataset classes, use openml cache

* fix example select_similar_datasets_by_knn.py

* create DatasetIDType

* create PredictorType

* remove DataManager, refactor cache

* update tests & test data

* allow explicit OpenMLDataset creation from name/search

* adapt examples to the last changes
MorrisNein committed Jul 20, 2023
1 parent 267e6f9 commit 5261b8f
Showing 59 changed files with 2,350 additions and 415 deletions.
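
For orientation, the updated examples below all follow the same pattern. Here is a minimal sketch assembled from this commit's diffs (the dataset names and the 'general' meta-feature group are simply the values used in the examples):

from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader
from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor

# Datasets are loaded explicitly; passing names now requires allow_names=True.
datasets = OpenMLDatasetsLoader().load(['nomao', 'sylvine'], allow_names=True)

# The extractor receives dataset objects (or OpenML ids) instead of dataset names.
extractor = PymfeExtractor(extractor_params={'groups': 'general'})
meta_features = extractor.extract(datasets)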
2 changes: 1 addition & 1 deletion .gitignore
@@ -129,4 +129,4 @@ dmypy.json
 .pyre/
 
 # User data
-data/
+/data
5 changes: 2 additions & 3 deletions examples/0_loading_data/load_list_of_datasests.py
@@ -6,9 +6,8 @@ def get_datasets():
         'nomao', 'sylvine', 'kc1', 'jungle_chess_2pcs_raw_endgame_complete', 'credit-g', 'delta_ailerons', 'pol'
     ]
     datasets_loader = OpenMLDatasetsLoader()
-    datasets = datasets_loader.load(dataset_names)
-    print(f'Datasets "{", ".join(dataset_names)}" are available at the paths:')
-    print('\n'.join(str(d) for d in datasets))
+    datasets = datasets_loader.load(dataset_names, allow_names=True)
+    print(f'Datasets "{", ".join(dataset_names)}" are downloaded.')
     return datasets


(diff for another changed file; file name not shown)
@@ -1,3 +1,5 @@
+import openml
+
 from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader
 from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor
 
@@ -6,8 +8,9 @@ def main():
     dataset_names = [
         'nomao', 'sylvine'
     ]
+    dataset_ids = [openml.datasets.get_dataset(name, download_data=False, download_qualities=False).dataset_id for name in dataset_names]
     extractor = PymfeExtractor(extractor_params={'groups': 'general'}, datasets_loader=OpenMLDatasetsLoader())
-    meta_features = extractor.extract(dataset_names)
+    meta_features = extractor.extract(dataset_ids)
     return meta_features


(diff for another changed file; file name not shown)
@@ -9,8 +9,8 @@ def main():
     loader = OpenMLDatasetsLoader()
     extractor = PymfeExtractor(extractor_params={'groups': 'general'})
 
-    cached_datasets = loader.load(dataset_names)
-    meta_features = extractor.extract(cached_datasets)
+    datasets = loader.load(dataset_names, allow_names=True)
+    meta_features = extractor.extract(datasets)
     return meta_features


(diff for another changed file; file name not shown)
@@ -8,9 +8,10 @@
 def main():
     # Define datasets.
     dataset_names = ['monks-problems-1', 'apsfailure', 'australian', 'bank-marketing']
+    datasets = OpenMLDatasetsLoader().load(dataset_names, allow_names=True)
     # Extract meta-features and load on demand.
-    extractor = PymfeExtractor(extractor_params={'groups': 'general'}, datasets_loader=OpenMLDatasetsLoader())
-    meta_features = extractor.extract(dataset_names)
+    extractor = PymfeExtractor(extractor_params={'groups': 'general'})
+    meta_features = extractor.extract(datasets)
     # Preprocess meta-features, as KNN does not support NaNs.
     meta_features = meta_features.dropna(axis=1, how='any')
     # Split datasets to train (preprocessing) and test (actual meta-algorithm objects).
(diff for another changed file; file name not shown)
@@ -2,7 +2,7 @@
 from golem.core.optimisers.fitness import SingleObjFitness
 from sklearn.model_selection import train_test_split
 
-from meta_automl.data_preparation.dataset import DatasetCache
+from meta_automl.data_preparation.dataset import OpenMLDataset
 from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader
 from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor
 from meta_automl.data_preparation.model import Model
@@ -13,9 +13,10 @@
 def main():
     # Define datasets.
     dataset_names = ['monks-problems-1', 'apsfailure', 'australian', 'bank-marketing']
+    datasets = OpenMLDatasetsLoader().load(dataset_names, allow_names=True)
     # Extract meta-features and load on demand.
-    extractor = PymfeExtractor(extractor_params={'groups': 'general'}, datasets_loader=OpenMLDatasetsLoader())
-    meta_features = extractor.extract(dataset_names)
+    extractor = PymfeExtractor(extractor_params={'groups': 'general'})
+    meta_features = extractor.extract(datasets)
     # Preprocess meta-features, as KNN does not support NaNs.
     meta_features = meta_features.dropna(axis=1, how='any')
     # Split datasets to train (preprocessing) and test (actual meta-algorithm objects).
@@ -29,8 +30,8 @@ def main():
         PipelineBuilder().add_node('normalization').add_node('logit').build(),
         PipelineBuilder().add_node('rf').add_node('logit').build()
     ]
-    best_models = [[Model(pipeline, SingleObjFitness(1), 'some_metric_name', DatasetCache(dataset_name))]
-                   for dataset_name, pipeline in zip(y_train, best_pipelines)]
+    best_models = [[Model(pipeline, SingleObjFitness(1), 'some_metric_name', OpenMLDataset(dataset_id))]
+                   for dataset_id, pipeline in zip(y_train, best_pipelines)]
 
     dataset_names_to_best_pipelines = dict(zip(y_train, best_models))
     advisor = DiverseFEDOTPipelineAdvisor(assessor, minimal_distance=2).fit(dataset_names_to_best_pipelines)
6 changes: 3 additions & 3 deletions examples/knowledge_base_loading.py
@@ -16,12 +16,12 @@
 # ===== Another way to get train models, but also group them by datasets:
 models_for_train = {}
 
-for dataset_name in train_datasets['dataset_name']:
+for dataset_id in train_datasets['dataset_id']:
     dataset_models = models_loader.load(
-        dataset_names=[dataset_name],  # load models just for this exact dataset.
+        dataset_ids=[dataset_id],  # load models just for this exact dataset.
         fitness_metric='logloss',  # must correspond to a metric name in a knowledge base.
     )
-    models_for_train[dataset_name] = dataset_models
+    models_for_train[dataset_id] = dataset_models
 
 # If you need to load data to the local storage
 # dataset = OpenMLDatasetsLoader().load_single(dataset_name)
59 changes: 0 additions & 59 deletions meta_automl/data_preparation/data_manager.py

This file was deleted.

64 changes: 0 additions & 64 deletions meta_automl/data_preparation/dataset.py

This file was deleted.

3 changes: 3 additions & 0 deletions meta_automl/data_preparation/dataset/__init__.py
@@ -0,0 +1,3 @@
from .dataset_base import DatasetBase, DatasetData, DatasetIDType
from .custom_dataset import DataNotFoundError, CustomDataset
from .openml_dataset import OpenMLDataset, OpenMLDatasetIDType
30 changes: 30 additions & 0 deletions meta_automl/data_preparation/dataset/custom_dataset.py
@@ -0,0 +1,30 @@
from __future__ import annotations

import pickle
from pathlib import Path
from typing import Optional

from meta_automl.data_preparation.dataset import DatasetBase
from meta_automl.data_preparation.dataset.dataset_base import DatasetData



class DataNotFoundError(FileNotFoundError):
    pass


class CustomDataset(DatasetBase):

    def get_data(self, cache_path: Optional[Path] = None) -> DatasetData:
        cache_path = cache_path or self.cache_path
        if not cache_path.exists():
            raise DataNotFoundError(f'Dataset {self} is missing by the path "{cache_path}".')
        with open(cache_path, 'rb') as f:
            dataset_data = pickle.load(f)
        return dataset_data

    def dump_data(self, dataset_data: DatasetData, cache_path: Optional[Path] = None) -> CustomDataset:
        cache_path = cache_path or self.cache_path
        with open(cache_path, 'wb') as f:
            pickle.dump(dataset_data, f)
        return self
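
A rough usage sketch for the class above (the id and the in-memory arrays are invented for illustration; dump_data pickles the DatasetData to the dataset's cache_path, and get_data reads it back):

import numpy as np

from meta_automl.data_preparation.dataset import CustomDataset
from meta_automl.data_preparation.dataset.dataset_base import DatasetData

dataset = CustomDataset('my_dataset')  # id_ is arbitrary for custom data
dataset.dump_data(DatasetData(x=np.random.rand(10, 3), y=np.arange(10)))
restored = dataset.get_data()  # raises DataNotFoundError if nothing was dumped for this id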
40 changes: 40 additions & 0 deletions meta_automl/data_preparation/dataset/dataset_base.py
@@ -0,0 +1,40 @@
from __future__ import annotations

from abc import abstractmethod, ABC
from dataclasses import dataclass
from pathlib import Path
from typing import Union, Optional, List, Any

import numpy as np
import pandas as pd
import scipy as sp

from meta_automl.data_preparation.file_system import CacheOperator, get_dataset_cache_path

DatasetIDType = Any


@dataclass
class DatasetData:
    x: Union[np.ndarray, pd.DataFrame, sp.sparse.csr_matrix]
    y: Optional[Union[np.ndarray, pd.DataFrame]] = None
    categorical_indicator: Optional[List[bool]] = None
    attribute_names: Optional[List[str]] = None


class DatasetBase(ABC, CacheOperator):

    def __init__(self, id_: DatasetIDType, name: Optional[str] = None):
        self.id_ = id_
        self.name = name

    def __repr__(self):
        return f'{self.__class__.__name__}(id_={self.id_}, name={self.name})'

    @abstractmethod
    def get_data(self) -> DatasetData:
        raise NotImplementedError()

    @property
    def cache_path(self) -> Path:
        return get_dataset_cache_path(self)
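
To illustrate the new container, a DatasetData might be filled like this (values invented; every concrete dataset's get_data() returns an instance of it):

import pandas as pd

from meta_automl.data_preparation.dataset.dataset_base import DatasetData

dataset_data = DatasetData(
    x=pd.DataFrame({'f1': [0.1, 0.2], 'f2': ['a', 'b']}),
    y=pd.DataFrame({'target': [0, 1]}),
    categorical_indicator=[False, True],
    attribute_names=['f1', 'f2'],
)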
39 changes: 39 additions & 0 deletions meta_automl/data_preparation/dataset/openml_dataset.py
@@ -0,0 +1,39 @@
from __future__ import annotations

from typing import Union

import openml

from meta_automl.data_preparation.dataset import DatasetBase
from meta_automl.data_preparation.dataset.dataset_base import DatasetData
from meta_automl.data_preparation.file_system import update_openml_cache_dir

OpenMLDatasetIDType = int

update_openml_cache_dir()


class OpenMLDataset(DatasetBase):

    def __init__(self, id_: OpenMLDatasetIDType):
        if isinstance(id_, str):
            raise ValueError('Creating OpenMLDataset by dataset name is ambiguous. Please, use dataset id.'
                             f'Otherwise, you can perform search by f{self.__class__.__name__}.from_search().')
        self._openml_dataset = openml.datasets.get_dataset(id_, download_data=False, download_qualities=False,
                                                           error_if_multiple=True)
        id_ = self._openml_dataset.id
        name = self._openml_dataset.name
        super().__init__(id_, name)

    @classmethod
    def from_search(cls, id_: Union[OpenMLDatasetIDType, str], **get_dataset_kwargs) -> OpenMLDataset:
        openml_dataset = openml.datasets.get_dataset(id_, download_data=False, download_qualities=False,
                                                     **get_dataset_kwargs)
        return cls(openml_dataset.id)

    def get_data(self, dataset_format: str = 'dataframe') -> DatasetData:
        X, y, categorical_indicator, attribute_names = self._openml_dataset.get_data(
            target=self._openml_dataset.default_target_attribute,
            dataset_format=dataset_format
        )
        return DatasetData(X, y, categorical_indicator, attribute_names)
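
The two creation paths mentioned in the commit message ('allow explicit OpenMLDataset creation from name/search') would be used roughly as follows; the numeric id is a placeholder, substitute any valid OpenML dataset id:

from meta_automl.data_preparation.dataset import OpenMLDataset

dataset = OpenMLDataset(1464)  # create by OpenML dataset id; passing a name here raises ValueError
dataset = OpenMLDataset.from_search('monks-problems-1')  # resolve a name (or id) through OpenML search
data = dataset.get_data()  # DatasetData(x, y, categorical_indicator, attribute_names)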
2 changes: 1 addition & 1 deletion meta_automl/data_preparation/datasets_loaders/__init__.py
@@ -1,2 +1,2 @@
 from .datasets_loader import DatasetsLoader
-from .openml_datasets_loader import OpenMLDatasetsLoader, OpenMLDatasetID
+from .openml_datasets_loader import OpenMLDatasetsLoader
16 changes: 4 additions & 12 deletions meta_automl/data_preparation/datasets_loaders/datasets_loader.py
@@ -1,25 +1,17 @@
 from __future__ import annotations
 
 from abc import abstractmethod
-from typing import List, Type
+from typing import List
 
-from meta_automl.data_preparation.data_manager import DataManager
-from meta_automl.data_preparation.dataset import Dataset, DatasetCache, NoCacheError
+from meta_automl.data_preparation.dataset import DatasetBase
 
 
 class DatasetsLoader:
-    data_manager: Type[DataManager] = DataManager
 
     @abstractmethod
-    def load(self, *args, **kwargs) -> List[DatasetCache]:
+    def load(self, *args, **kwargs) -> List[DatasetBase]:
         raise NotImplementedError()
 
     @abstractmethod
-    def load_single(self, *args, **kwargs) -> DatasetCache:
+    def load_single(self, *args, **kwargs) -> DatasetBase:
         raise NotImplementedError()
-
-    def cache_to_memory(self, dataset: DatasetCache) -> Dataset:
-        try:
-            return dataset.from_cache()
-        except NoCacheError:
-            return self.load_single(dataset.id).from_cache()
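
For orientation, a loader implementation now only has to return DatasetBase objects; a minimal hypothetical subclass (not part of this commit, the id values are illustrative) could look like this:

from typing import List

from meta_automl.data_preparation.dataset import CustomDataset, DatasetBase
from meta_automl.data_preparation.datasets_loaders import DatasetsLoader


class PickleDatasetsLoader(DatasetsLoader):  # hypothetical example, not from the commit
    def load(self, dataset_ids: List[str]) -> List[DatasetBase]:
        return [self.load_single(dataset_id) for dataset_id in dataset_ids]

    def load_single(self, dataset_id: str) -> DatasetBase:
        return CustomDataset(dataset_id)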