Commit

add datasets train/test split
MorrisNein committed Jul 20, 2023
1 parent 0c1c9c7 commit 267e6f9
Showing 1 changed file with 64 additions and 0 deletions.
meta_automl/data_preparation/datasets_train_test_split.py (+64 −0)
@@ -0,0 +1,64 @@
import openml
import pandas as pd

from sklearn.model_selection import train_test_split


def openml_datasets_train_test_split(dataset_ids, train_size: float = 0.7, seed: int = 42):
    df_openml_datasets = openml.datasets.list_datasets(dataset_ids, output_format='dataframe')
    # Take an explicit copy of the meta-feature columns to avoid pandas' SettingWithCopyWarning
    # on the assignments below.
    df_openml_datasets_split_features = df_openml_datasets[
        ['name', 'NumberOfInstances', 'NumberOfFeatures', 'NumberOfClasses']].copy()
    # Discretize each meta-feature against its median: 'small'/'big' for instance and feature counts,
    # with an extra 'binary' label for two-class datasets.
    for column in df_openml_datasets_split_features.columns[1:]:
        if column != 'NumberOfClasses':
            median = df_openml_datasets_split_features[column].median()
            df_openml_datasets_split_features[column] = \
                (df_openml_datasets_split_features[column] > median).map({False: 'small', True: 'big'})
        else:
            median = df_openml_datasets_split_features[column][df_openml_datasets_split_features[column] != 2].median()
            df_openml_datasets_split_features[column] = df_openml_datasets_split_features[column].apply(
                lambda n: 'binary' if n == 2 else {False: 'small', True: 'big'}[n > median])
    # Combine the discretized meta-features into a single stratification label, e.g. 'big_small_binary'.
    df_split_categories = df_openml_datasets_split_features.copy()
    df_split_categories['category'] = df_openml_datasets_split_features.apply(lambda row: '_'.join(
        row[1:]), axis=1)
    df_split_categories.drop(columns=['NumberOfInstances', 'NumberOfFeatures', 'NumberOfClasses'], inplace=True)
    # Group single-value categories into a separate category:
    # they cannot be stratified and are sent straight to the test set.
    cat_counts = df_split_categories['category'].value_counts()
    single_value_categories = cat_counts[cat_counts == 1].index
    idx = df_split_categories[df_split_categories['category'].isin(single_value_categories)].index
    df_split_categories.loc[idx, 'category'] = 'single_value'
    df_datasets_to_split = df_split_categories[df_split_categories['category'] != 'single_value']
    df_test_only_datasets = df_split_categories[df_split_categories['category'] == 'single_value']
    if not df_datasets_to_split.empty:
        df_train_datasets, df_test_datasets = train_test_split(
            df_datasets_to_split,
            train_size=train_size,
            shuffle=True,
            stratify=df_datasets_to_split['category'],
            random_state=seed
        )
        df_test_datasets = pd.concat([df_test_datasets, df_test_only_datasets])
    else:
        df_train_datasets, df_test_datasets = train_test_split(
            df_split_categories,
            train_size=train_size,
            shuffle=True,
            random_state=seed
        )
    df_train_datasets['is_train'] = 1
    df_test_datasets['is_train'] = 0
    # Re-attach the discretized meta-features and expose the dataset id/name explicitly.
    df_split_datasets = pd.concat([df_train_datasets, df_test_datasets]).join(
        df_openml_datasets_split_features.drop(columns='name'))
    df_split_datasets = df_split_datasets.rename(columns={'name': 'dataset_name'})
    df_split_datasets.index.rename('dataset_id', inplace=True)

    return df_split_datasets


def main():
    # Suite 99 is the OpenML-CC18 curated classification benchmark.
    dataset_ids = openml.study.get_suite(99).data
    df_split_datasets = openml_datasets_train_test_split(dataset_ids)
    df_split_datasets.to_csv('train_test_datasets_opencc18.csv')


if __name__ == '__main__':
    main()
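
For reference, a minimal usage sketch of the new helper (not part of the commit itself; it assumes the repository root is on the import path and that the openml package is installed):

import openml

from meta_automl.data_preparation.datasets_train_test_split import openml_datasets_train_test_split

# OpenML-CC18 curated benchmark suite (suite id 99), as used in main() above.
dataset_ids = openml.study.get_suite(99).data
df_split = openml_datasets_train_test_split(dataset_ids, train_size=0.7, seed=42)

# The index holds the OpenML dataset ids; 'is_train' marks the split assignment.
train_ids = df_split[df_split['is_train'] == 1].index.tolist()
test_ids = df_split[df_split['is_train'] == 0].index.tolist()
print(f'{len(train_ids)} train datasets, {len(test_ids)} test datasets')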
