forked from gheisenberg/AML
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
52 lines (43 loc) · 1.92 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import pandas as pd
# Makes sure you see all columns
pd.set_option('display.max_columns', None)
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
class DataLoader:
    """Load, preprocess, split and oversample a tabular CSV dataset.

    The loaded table is kept in ``self.data`` (a pandas DataFrame, or
    ``None`` until ``load_dataset`` has been called or ``data`` has been
    assigned directly).
    """

    # Columns that ``preprocess_data`` replaces with one-hot indicators.
    _CATEGORICAL_COLS = ("gender",
                         "ever_married",
                         "work_type",
                         "Residence_type",
                         "smoking_status")

    def __init__(self):
        self.data = None

    def _require_data(self):
        """Return ``self.data`` or raise a clear error if nothing is loaded."""
        if self.data is None:
            raise RuntimeError(
                "No data loaded; call load_dataset() before this method."
            )
        return self.data

    def load_dataset(self, path="./data/healthcare-dataset-stroke-data.csv"):
        """Read the CSV file at *path* into ``self.data``."""
        self.data = pd.read_csv(path)

    def preprocess_data(self):
        """One-hot encode categoricals, impute BMI and drop the ``id`` column.

        The result replaces ``self.data``. Indicator columns come first,
        followed by the remaining original columns in their original order,
        so the last column of the input stays the last column of the output.

        Raises:
            RuntimeError: if no data has been loaded yet.
        """
        data = self._require_data()
        categorical_cols = list(self._CATEGORICAL_COLS)

        # One-hot encode all categorical columns
        encoded = pd.get_dummies(data[categorical_cols],
                                 prefix=categorical_cols)

        # Drop the encoded source columns and the id (not relevant as a
        # feature), then impute missing BMI values.
        # NOTE(review): 0 is used as the fill value for missing BMI, as in
        # the original code; confirm that this placeholder is intended.
        remaining = data.drop(columns=categorical_cols + ["id"])
        remaining = remaining.assign(bmi=remaining["bmi"].fillna(0))

        self.data = pd.concat([encoded, remaining], axis=1)

        # Standardization
        # Usually we would standardize here and convert it back later
        # But for simplification we will not standardize / normalize the features

    def get_data_split(self, test_size=0.20, random_state=2022):
        """Split ``self.data`` into train and test parts.

        The last column is treated as the target and every other column as
        a feature.

        Args:
            test_size: forwarded to ``train_test_split``.
            random_state: seed forwarded to ``train_test_split``.

        Returns:
            The ``train_test_split`` result for ``(X, y)``:
            ``X_train, X_test, y_train, y_test``.

        Raises:
            RuntimeError: if no data has been loaded yet.
        """
        data = self._require_data()
        X = data.iloc[:, :-1]
        y = data.iloc[:, -1]
        return train_test_split(X, y,
                                test_size=test_size,
                                random_state=random_state)

    def oversample(self, X_train, y_train, random_state=None):
        """Randomly oversample the minority class of a training set.

        Args:
            X_train: feature DataFrame.
            y_train: target Series aligned with *X_train*.
            random_state: optional seed for ``RandomOverSampler``; the
                default ``None`` leaves the sampling unseeded.

        Returns:
            ``(x_over, y_over)``: a DataFrame with the columns and dtypes of
            *X_train* and a Series with the name of *y_train*, both with a
            fresh default index.
        """
        sampler = RandomOverSampler(sampling_strategy='minority',
                                    random_state=random_state)

        # Convert to numpy and oversample
        x_np, y_np = sampler.fit_resample(X_train.to_numpy(),
                                          y_train.to_numpy())

        # Convert back to pandas. to_numpy() collapses mixed column dtypes
        # into one common dtype, so restore the original per-column dtypes.
        x_over = pd.DataFrame(x_np, columns=X_train.columns)
        x_over = x_over.astype(X_train.dtypes.to_dict())
        y_over = pd.Series(y_np, name=y_train.name)
        return x_over, y_over