Refactor data storage (#15)
* refactor dataset classes, use openml cache

* fix example select_similar_datasets_by_knn.py

* create DatasetIDType

* create PredictorType

* remove DataManager, refactor cache

* update tests & test data

* allow explicit OpenMLDataset creation from name/search

* adapt examples to the latest changes
MorrisNein authored Jun 30, 2023
1 parent 7c42e79 commit cb11a3c
Showing 60 changed files with 2,399 additions and 462 deletions.
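
The diffs below move the examples and experiments to the refactored, id-based dataset API. As a quick orientation, here is a minimal usage sketch assembled only from calls that appear in the changed files of this commit; the dataset names are illustrative, and it assumes the meta_automl package from this repository is installed. It is a sketch of the pattern the updated examples follow, not a definitive API reference.

# Sketch based on the updated examples in this commit (names below are illustrative).
from meta_automl.data_preparation.dataset import OpenMLDataset
from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader
from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor

# Loading by dataset name now has to be allowed explicitly; datasets are otherwise addressed by OpenML id.
datasets = OpenMLDatasetsLoader().load(['monks-problems-1', 'australian'], allow_names=True)

# Meta-feature extractors consume dataset objects (or OpenML ids) instead of names.
extractor = PymfeExtractor(extractor_params={'groups': 'general'})
meta_features = extractor.extract(datasets)

# A dataset can also be constructed directly from its OpenML id.
dataset = OpenMLDataset(datasets[0].id_)
data = dataset.get_data(dataset_format='array')  # DatasetData with .x and .y arrays
print(dataset.name, data.x.shape)
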
2 changes: 1 addition & 1 deletion .gitignore
@@ -129,4 +129,4 @@ dmypy.json
.pyre/

# User data
data/
/data
5 changes: 2 additions & 3 deletions examples/0_loading_data/load_list_of_datasests.py
@@ -6,9 +6,8 @@ def get_datasets():
'nomao', 'sylvine', 'kc1', 'jungle_chess_2pcs_raw_endgame_complete', 'credit-g', 'delta_ailerons', 'pol'
]
datasets_loader = OpenMLDatasetsLoader()
datasets = datasets_loader.load(dataset_names)
print(f'Datasets "{", ".join(dataset_names)}" are available at the paths:')
print('\n'.join(str(d) for d in datasets))
datasets = datasets_loader.load(dataset_names, allow_names=True)
print(f'Datasets "{", ".join(dataset_names)}" are downloaded.')
return datasets


@@ -1,3 +1,5 @@
import openml

from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader
from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor

@@ -6,8 +8,9 @@ def main():
dataset_names = [
'nomao', 'sylvine'
]
dataset_ids = [openml.datasets.get_dataset(name, download_data=False, download_qualities=False).dataset_id for name in dataset_names]
extractor = PymfeExtractor(extractor_params={'groups': 'general'}, datasets_loader=OpenMLDatasetsLoader())
meta_features = extractor.extract(dataset_names)
meta_features = extractor.extract(dataset_ids)
return meta_features


@@ -9,8 +9,8 @@ def main():
loader = OpenMLDatasetsLoader()
extractor = PymfeExtractor(extractor_params={'groups': 'general'})

cached_datasets = loader.load(dataset_names)
meta_features = extractor.extract(cached_datasets)
datasets = loader.load(dataset_names, allow_names=True)
meta_features = extractor.extract(datasets)
return meta_features


@@ -8,9 +8,10 @@
def main():
# Define datasets.
dataset_names = ['monks-problems-1', 'apsfailure', 'australian', 'bank-marketing']
datasets = OpenMLDatasetsLoader().load(dataset_names, allow_names=True)
# Extract meta-features and load on demand.
extractor = PymfeExtractor(extractor_params={'groups': 'general'}, datasets_loader=OpenMLDatasetsLoader())
meta_features = extractor.extract(dataset_names)
extractor = PymfeExtractor(extractor_params={'groups': 'general'})
meta_features = extractor.extract(datasets)
# Preprocess meta-features, as KNN does not support NaNs.
meta_features = meta_features.dropna(axis=1, how='any')
# Split datasets to train (preprocessing) and test (actual meta-algorithm objects).
@@ -2,7 +2,7 @@
from golem.core.optimisers.fitness import SingleObjFitness
from sklearn.model_selection import train_test_split

from meta_automl.data_preparation.dataset import DatasetCache
from meta_automl.data_preparation.dataset import OpenMLDataset
from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader
from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor
from meta_automl.data_preparation.model import Model
@@ -13,9 +13,10 @@
def main():
# Define datasets.
dataset_names = ['monks-problems-1', 'apsfailure', 'australian', 'bank-marketing']
datasets = OpenMLDatasetsLoader().load(dataset_names, allow_names=True)
# Extract meta-features and load on demand.
extractor = PymfeExtractor(extractor_params={'groups': 'general'}, datasets_loader=OpenMLDatasetsLoader())
meta_features = extractor.extract(dataset_names)
extractor = PymfeExtractor(extractor_params={'groups': 'general'})
meta_features = extractor.extract(datasets)
# Preprocess meta-features, as KNN does not support NaNs.
meta_features = meta_features.dropna(axis=1, how='any')
# Split datasets to train (preprocessing) and test (actual meta-algorithm objects).
@@ -29,8 +30,8 @@ def main():
PipelineBuilder().add_node('normalization').add_node('logit').build(),
PipelineBuilder().add_node('rf').add_node('logit').build()
]
best_models = [[Model(pipeline, SingleObjFitness(1), 'some_metric_name', DatasetCache(dataset_name))]
for dataset_name, pipeline in zip(y_train, best_pipelines)]
best_models = [[Model(pipeline, SingleObjFitness(1), 'some_metric_name', OpenMLDataset(dataset_id))]
for dataset_id, pipeline in zip(y_train, best_pipelines)]

dataset_names_to_best_pipelines = dict(zip(y_train, best_models))
advisor = DiverseFEDOTPipelineAdvisor(assessor, minimal_distance=2).fit(dataset_names_to_best_pipelines)
6 changes: 3 additions & 3 deletions examples/knowledge_base_loading.py
@@ -16,12 +16,12 @@
# ===== Another way to get train models, but also group them by datasets:
models_for_train = {}

for dataset_name in train_datasets['dataset_name']:
for dataset_id in train_datasets['dataset_id']:
dataset_models = models_loader.load(
dataset_names=[dataset_name], # load models just for this exact dataset.
dataset_ids=[dataset_id], # load models just for this exact dataset.
fitness_metric='logloss', # must correspond to a metric name in a knowledge base.
)
models_for_train[dataset_name] = dataset_models
models_for_train[dataset_id] = dataset_models

# If you need to load data to the local storage
# dataset = OpenMLDatasetsLoader().load_single(dataset_name)
96 changes: 49 additions & 47 deletions experiments/fedot_warm_start/run.py
@@ -22,8 +22,8 @@
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm

from meta_automl.data_preparation.data_manager import DataManager
from meta_automl.data_preparation.dataset import DatasetCache, Dataset

from meta_automl.data_preparation.dataset import OpenMLDataset, DatasetData
from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader
from meta_automl.data_preparation.datasets_train_test_split import openml_datasets_train_test_split
from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor
@@ -37,8 +37,8 @@
N_DATASETS = 3
TEST_SIZE = 0.33
# Evaluation timeouts
TRAIN_TIMEOUT = 1
TEST_TIMEOUT = 1
TRAIN_TIMEOUT = 0.01
TEST_TIMEOUT = 0.01
# Models & datasets
N_BEST_DATASET_MODELS_TO_MEMORIZE = 10
N_CLOSEST_DATASETS_TO_PROPOSE = 5
@@ -61,7 +61,7 @@
time_now = datetime.now()
time_now_iso = time_now.isoformat(timespec="minutes")
time_now_for_path = time_now_iso.replace(":", ".")
save_dir = DataManager.get_data_dir(). \
save_dir = get_data_dir(). \
joinpath('experiments').joinpath('fedot_warm_start').joinpath(f'run_{time_now_for_path}')
save_dir.mkdir(parents=True)
log_file = save_dir.joinpath('log.txt')
@@ -75,18 +75,23 @@
)


def prepare_data() -> Tuple[List[int], Dict[str, DatasetCache]]:
def prepare_data() -> Tuple[pd.DataFrame, pd.DataFrame, Dict[int, OpenMLDataset]]:
"""Returns dictionary with dataset names and cached datasets downloaded from OpenML."""

dataset_ids = openml.study.get_suite(99).data
if N_DATASETS is not None:
dataset_ids = pd.Series(dataset_ids)
dataset_ids = dataset_ids.sample(n=N_DATASETS, random_state=SEED)
dataset_ids = list(dataset_ids)
return dataset_ids, {cache.name: cache for cache in OpenMLDatasetsLoader().load(dataset_ids)}

df_split_datasets = openml_datasets_train_test_split(dataset_ids, seed=SEED)
df_datasets_train = df_split_datasets[df_split_datasets['is_train'] == 1]
df_datasets_test = df_split_datasets[df_split_datasets['is_train'] == 0]

datasets = {dataset.id_: dataset for dataset in OpenMLDatasetsLoader().load(dataset_ids)}
return df_datasets_train, df_datasets_test, datasets

def transform_data_for_fedot(data: Dataset) -> (np.array, np.array):

def transform_data_for_fedot(data: DatasetData) -> (np.array, np.array):
x = data.x
y = data.y
if len(y.shape) == 1:
@@ -127,8 +132,8 @@ def prepare_extractor_and_assessor(datasets_train: List[str]):
return data_similarity_assessor, extractor


def fit_fedot(data: Dataset, timeout: float, run_label: str, initial_assumption=None):
x, y = transform_data_for_fedot(data)
def fit_fedot(dataset: OpenMLDataset, timeout: float, run_label: str, initial_assumption=None):
x, y = transform_data_for_fedot(dataset.get_data(dataset_format='array'))

time_start = timeit.default_timer()
fedot = Fedot(timeout=timeout, initial_assumption=initial_assumption, **COMMON_FEDOT_PARAMS)
@@ -137,14 +142,14 @@ def fit_fedot(data: Dataset, timeout: float, run_label: str, initial_assumption=

metrics = get_pipeline_metrics(fedot.current_pipeline, fedot.train_data)
pipeline = fedot.current_pipeline
run_results = get_result_data_row(dataset=data, run_label=run_label, pipeline=pipeline, automl_time_sec=automl_time,
run_results = get_result_data_row(dataset=dataset, run_label=run_label, pipeline=pipeline, automl_time_sec=automl_time,
automl_timeout_min=fedot.params.timeout, history_obj=fedot.history, **metrics)
return fedot, run_results


def get_result_data_row(dataset, run_label: str, pipeline, history_obj=None, automl_time_sec=0., automl_timeout_min=0.,
**metrics):
run_results = dict(dataset_id=dataset.id,
def get_result_data_row(dataset: OpenMLDataset, run_label: str, pipeline, history_obj=None, automl_time_sec=0.,
automl_timeout_min=0., **metrics):
run_results = dict(dataset_id=dataset.id_,
dataset_name=dataset.name,
run_label=run_label,
model_obj=pipeline,
@@ -157,38 +162,35 @@ def get_result_data_row(dataset, run_label: str, pipeline, history_obj=None, aut
return run_results


def extract_best_history_models(dataset_cache, history):
def extract_best_history_models(dataset, history):
best_individuals = sorted(chain(*history.individuals),
key=lambda ind: ind.fitness,
reverse=True)
best_individuals = list({ind.graph.descriptive_id: ind for ind in best_individuals}.values())
best_models = []
for individual in best_individuals[:N_BEST_DATASET_MODELS_TO_MEMORIZE]:
pipeline = PipelineAdapter().restore(individual.graph)
model = Model(pipeline, individual.fitness, history.objective.metric_names[0], dataset_cache)
model = Model(pipeline, individual.fitness, history.objective.metric_names[0], dataset)
best_models.append(model)
return best_models


def main():
baseline_pipeline = PipelineBuilder().add_node('rf').build()

dataset_ids, datasets_cache = prepare_data()
df_datasets_train, df_datasets_test, datasets = prepare_data()

split_datasets = openml_datasets_train_test_split(dataset_ids, seed=SEED)
datasets_train = split_datasets[split_datasets['is_train'] == 1]['dataset_name'].to_list()
datasets_test = split_datasets[~split_datasets['is_train'] == 0]['dataset_name'].to_list()
dataset_ids_train = df_datasets_train.index.to_list()
dataset_ids_test = df_datasets_test.index.to_list()

evaluation_results = []
best_models_per_dataset = {}
progress_file = open(save_dir.joinpath('progress.txt'), 'a')
for name in tqdm(datasets_cache.keys(), 'FEDOT, all datasets', file=progress_file):
for dataset_id in tqdm(datasets.keys(), 'FEDOT, all datasets', file=progress_file):
try:
cache = datasets_cache[name]
data = cache.from_cache()

timeout = TRAIN_TIMEOUT if name in datasets_train else TEST_TIMEOUT
fedot, run_results = fit_fedot(data=data, timeout=timeout, run_label='FEDOT')
dataset = datasets[dataset_id]
timeout = TRAIN_TIMEOUT if dataset_id in dataset_ids_train else TEST_TIMEOUT
fedot, run_results = fit_fedot(dataset=dataset, timeout=timeout, run_label='FEDOT')
evaluation_results.append(run_results)
# TODO:
# x Turn the tuned pipeline into a model (evaluate its fitness on the data)
@@ -197,50 +199,49 @@ def main():

# Filter out unique individuals with the best fitness
history = fedot.history
best_models = extract_best_history_models(cache, history)
best_models_per_dataset[name] = best_models
best_models = extract_best_history_models(dataset, history)
best_models_per_dataset[dataset_id] = best_models
except Exception:
logging.exception(f'Train dataset "{name}"')
logging.exception(f'Train dataset "{dataset_id}"')

data_similarity_assessor, extractor = prepare_extractor_and_assessor(datasets_train)
data_similarity_assessor, extractor = prepare_extractor_and_assessor(dataset_ids_train)
model_advisor = DiverseFEDOTPipelineAdvisor(data_similarity_assessor, n_best_to_advise=N_BEST_MODELS_TO_ADVISE,
minimal_distance=MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS)
model_advisor.fit(best_models_per_dataset)

for name in tqdm(datasets_test, 'MetaFEDOT, Test datasets', file=progress_file):
for dataset_id in tqdm(dataset_ids_test, 'MetaFEDOT, Test datasets', file=progress_file):
try:
cache = datasets_cache[name]
data = cache.from_cache()
dataset = datasets[dataset_id]

# Run meta AutoML
# 1
time_start = timeit.default_timer()
meta_features = extractor.extract([cache], fill_input_nans=True, use_cached=False, update_cached=True)
meta_features = extractor.extract([dataset], fill_input_nans=True, use_cached=False, update_cached=True)
meta_features = meta_features.fillna(0)
meta_learning_time_sec = timeit.default_timer() - time_start
initial_assumptions = model_advisor.predict(meta_features)[0]
assumption_pipelines = [model.predictor for model in initial_assumptions]
# 2
fedot_meta, fedot_meta_results = fit_fedot(data=data, timeout=TEST_TIMEOUT, run_label='MetaFEDOT',
fedot_meta, fedot_meta_results = fit_fedot(dataset=dataset, timeout=TEST_TIMEOUT, run_label='MetaFEDOT',
initial_assumption=assumption_pipelines)
fedot_meta_results['meta_learning_time_sec'] = meta_learning_time_sec
evaluation_results.append(fedot_meta_results)

# Fit & evaluate simple baseline
baseline_metrics = get_pipeline_metrics(baseline_pipeline, fedot_meta.train_data)
baseline_res = get_result_data_row(dataset=data, run_label='simple baseline', pipeline=baseline_pipeline,
baseline_res = get_result_data_row(dataset=dataset, run_label='simple baseline', pipeline=baseline_pipeline,
**baseline_metrics)
evaluation_results.append(baseline_res)

# Fit & evaluate initial assumptions
for i, assumption in enumerate(initial_assumptions):
pipeline = assumption.predictor
assumption_metrics = get_pipeline_metrics(pipeline, fedot_meta.train_data)
assumption_res = get_result_data_row(dataset=data, run_label=f'MetaFEDOT - initial assumption {i}',
assumption_res = get_result_data_row(dataset=dataset, run_label=f'MetaFEDOT - initial assumption {i}',
pipeline=pipeline, **assumption_metrics)
evaluation_results.append(assumption_res)
except Exception:
logging.exception(f'Test dataset "{name}"')
logging.exception(f'Test dataset "{dataset_id}"')
progress_file.close()

# Save the accumulated results
@@ -250,11 +251,11 @@ def main():
for res in evaluation_results:
try:
res['run_date'] = time_now
dataset_name = res['dataset_name']
dataset_id = res['dataset_id']
run_label = res['run_label']
# define saving paths
model_path = models_dir.joinpath(f'{dataset_name}_{run_label}')
history_path = history_dir.joinpath(f'{dataset_name}_{run_label}_history.json')
model_path = models_dir.joinpath(f'{dataset_id}_{run_label}')
history_path = history_dir.joinpath(f'{dataset_id}_{run_label}_history.json')
# replace objects with export paths for csv
res['model_path'] = str(model_path)
res.pop('model_obj').save(res['model_path'])
@@ -271,12 +272,13 @@ def main():
params = {
'run_date': time_now_iso,
'seed': SEED,
'n_datasets': N_DATASETS or len(dataset_ids),
'n_datasets': N_DATASETS or len(datasets),
'test_size': TEST_SIZE,
'dataset_ids': dataset_ids,
'dataset_names': list(datasets_cache.keys()),
'dataset_names_train': datasets_train,
'dataset_names_test': datasets_test,
'dataset_ids': list(datasets.keys()),
'dataset_ids_train': dataset_ids_train,
'dataset_ids_test': dataset_ids_test,
'dataset_names_train': df_datasets_train['dataset_name'].to_list(),
'dataset_names_test': df_datasets_test['dataset_name'].to_list(),
'train_timeout': TRAIN_TIMEOUT,
'test_timeout': TEST_TIMEOUT,
'n_best_dataset_models_to_memorize': N_BEST_DATASET_MODELS_TO_MEMORIZE,