-
Notifications
You must be signed in to change notification settings - Fork 0
/
data.py
63 lines (48 loc) · 1.6 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
from __future__ import annotations
from dataclasses import dataclass
import openml
import pandas as pd
from openml import OpenMLDataset
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
@dataclass
class Dataset:
    """An OpenML dataset split into encoded features and encoded targets.

    Attributes
    ----------
    name: str
        The name of the dataset as reported by OpenML.
    id: int
        The OpenML id the dataset was fetched with.
    features: pd.DataFrame
        Every column except the target. Columns whose dtype is ``object``,
        ``category`` or ``string`` are replaced by integer codes.
    labels: pd.DataFrame
        The encoded target column.
        NOTE(review): ``from_openml`` stores the raw result of
        ``LabelEncoder.fit_transform`` here, which is presumably a NumPy
        array rather than a DataFrame -- confirm before relying on
        DataFrame-only methods.
    openml: OpenMLDataset
        The OpenML dataset object the data was extracted from.
    encoders: dict[str, LabelEncoder]
        Maps the name of each encoded feature column to the encoder fitted
        on that column, so the integer codes can be mapped back.
    """

    name: str
    id: int
    features: pd.DataFrame
    labels: pd.DataFrame
    openml: OpenMLDataset
    encoders: dict[str, LabelEncoder]

    @staticmethod
    def from_openml(id: int) -> Dataset:
        """Processes a binary classification OpenMLDataset into its features and targets

        Parameters
        ----------
        id: int
            The id of the dataset

        Returns
        -------
        Dataset
            The dataset with its categorical feature columns and its target
            label-encoded. ``encoders`` holds one fitted encoder per encoded
            feature column; the encoder used for the target is not stored.
        """
        dataset = openml.datasets.get_dataset(id)
        target = dataset.default_target_attribute
        data, _, _, _ = dataset.get_data()
        assert isinstance(data, pd.DataFrame)

        # Process the features and turn SOME categorical columns into ints
        features = data.drop(columns=target)
        encoders: dict[str, LabelEncoder] = {}
        for name, col in features.items():
            if str(col.dtype) in ["object", "category", "string"]:
                # Each column needs its own encoder. Re-fitting a shared
                # instance would leave every entry of `encoders` pointing at
                # the same object, holding only the classes of whatever was
                # fitted last.
                encoder = LabelEncoder()
                features[name] = encoder.fit_transform(col)
                encoders[name] = encoder

        # The target gets a dedicated encoder so that fitting it cannot
        # disturb the per-column feature encoders stored above.
        labels = LabelEncoder().fit_transform(data[target])

        # NOTE: binary classification is assumed, but the labels are kept as
        # the encoder's integer codes and are not cast to bool.
        return Dataset(
            name=dataset.name,
            id=id,
            features=features,
            labels=labels,
            openml=dataset,
            encoders=encoders,
        )