-
Notifications
You must be signed in to change notification settings - Fork 0
/
train_titanic.py
60 lines (40 loc) · 1.5 KB
/
train_titanic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import numpy as np
import preprocess_titanic as pf
import config_titanic as config
import warnings
# Silence all warnings for the rest of this run.
warnings.simplefilter('ignore')

# ================================================
# TRAINING STEP - needed to fit and persist the model
# ================================================

# Read the dataset from the location given in the config module.
data = pf.load_data(config.PATH_TO_DATASET)

# Split into train/test features and targets, using config.TARGET as the
# target column. Only the training partition is used in the visible part
# of this script.
X_train, X_test, y_train, y_test = pf.divide_train_test(
    data,
    config.TARGET,
)
# Clean the training features with pf.replace
# (per the original note: replaces '?' with NaN).
X_train = pf.replace(X_train)

# Rewrite the 'cabin' column element-wise with pf.get_first_cabin.
cabin_values = X_train['cabin']
X_train['cabin'] = cabin_values.apply(pf.get_first_cabin)

# Add a 'title' column computed from the config.NAME column via pf.get_title.
name_values = X_train[config.NAME]
X_train['title'] = name_values.apply(pf.get_title)

# Cast the configured numerical columns (original note: into float).
for column in config.NUMERICAL_CAST:
    X_train[column] = pf.cast_numerical(X_train, column)

# Fill missing values in the configured numerical columns.
for column in config.NUMERICAL_TO_IMPUTE:
    X_train[column] = pf.impute_numerical(X_train, column)

# Group rare labels in each categorical column, using the per-column
# entry of config.FREQUENT_LABELS.
for column in config.CATEGORICAL_TO_ONEHOT:
    frequent_labels = config.FREQUENT_LABELS[column]
    X_train[column] = pf.remove_rare_labels(X_train, column, frequent_labels)

# Create dummy variables for the categorical columns.
X_train = pf.dummy_variables(X_train, config.CATEGORICAL_TO_ONEHOT)

# Drop the features listed in config.DROP_VAR
# (axis=1 with labels is the documented equivalent of columns=...).
X_train = X_train.drop(config.DROP_VAR, axis=1)
# Output locations for the fitted artefacts, taken from the config module.
scaler_path = config.OUTPUT_SCALER_PATH
model_path = config.OUTPUT_MODEL_PATH

# Train the scaler on the processed training features
# (per the original note, pf.train_scaler also saves it).
scaler = pf.train_scaler(X_train, scaler_path)

# Replace the training features with their scaled version.
X_train = scaler.transform(X_train)

# Train the model on the scaled features
# (per the original note, pf.train_model also saves it).
pf.train_model(X_train, y_train, model_path)

print('Finished training')