This repository has been archived by the owner on Jun 22, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 58
/
Copy pathpipeline_config.py
360 lines (355 loc) · 23.1 KB
/
pipeline_config.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
import os
from attrdict import AttrDict
from deepsense import neptune
from utils import read_params, multi_roc_auc_score
# Experiment context and hyperparameters; both are resolved once, at import time.
ctx = neptune.Context()
params = read_params(ctx)

# Passed to the 'xy_splitter' step as its x_columns.
X_COLUMNS = ['comment_text_english']

# Passed to the 'xy_splitter' step as its y_columns.
Y_COLUMNS = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Not referenced in this module; presumably consumed by the importing pipeline code.
CV_LABELS = ['toxic']
ID_LABEL = ['id']
def _param_with_fallback(name, fallback_name):
    """Return ``params.<name>``, or ``params.<fallback_name>`` if ``name`` is not defined.

    Keeps experiment configurations that predate ``name`` working while letting
    newer configurations set it independently.
    """
    try:
        return getattr(params, name)
    except (AttributeError, KeyError):
        return getattr(params, fallback_name)


def _word_input_params():
    """Leading ``model_params`` entries of the word-level networks."""
    return {
        'max_features': params.max_features_word,
        'maxlen': params.maxlen_words,
        'embedding_size': params.word_embedding_size,
        'trainable_embedding': bool(params.trainable_embedding),
    }


def _char_input_params():
    """Leading ``model_params`` entries of the char-level network."""
    return {
        'max_features': params.max_features_char,
        'maxlen': params.maxlen_char,
        'embedding_size': params.char_embedding_size,
    }


def _conv_model_params(input_params):
    """Build ``model_params`` for the convolutional networks.

    ``input_params`` holds the entries that differ between the word-level and
    char-level variants; they are placed first, followed by the shared
    convolutional hyperparameters. A fresh dict is returned on every call.
    """
    model_params = dict(input_params)
    model_params.update({
        'filter_nr': params.filter_nr,
        'kernel_size': params.kernel_size,
        'repeat_block': params.repeat_block,
        'dense_size': params.dense_size,
        'repeat_dense': params.repeat_dense,
        'max_pooling': bool(params.max_pooling),
        'mean_pooling': bool(params.mean_pooling),
        'weighted_average_attention': bool(params.weighted_average_attention),
        'concat_mode': params.concat_mode,
        'dropout_embedding': params.dropout_embedding,
        'conv_dropout': params.conv_dropout,
        'dense_dropout': params.dense_dropout,
        'dropout_mode': params.dropout_mode,
        'conv_kernel_reg_l2': params.conv_kernel_reg_l2,
        'conv_bias_reg_l2': params.conv_bias_reg_l2,
        'dense_kernel_reg_l2': params.dense_kernel_reg_l2,
        'dense_bias_reg_l2': params.dense_bias_reg_l2,
        'use_prelu': bool(params.use_prelu),
        'use_batch_norm': bool(params.use_batch_norm),
        'batch_norm_first': bool(params.batch_norm_first),
    })
    return model_params


def _rnn_model_params(input_params):
    """Build ``model_params`` for the recurrent networks and the RNN stacker.

    ``input_params`` holds the leading input/embedding entries (empty for the
    stacker); the shared recurrent hyperparameters follow. A fresh dict is
    returned on every call.
    """
    model_params = dict(input_params)
    model_params.update({
        # NOTE(review): the recurrent unit count reuses params.filter_nr;
        # there is no dedicated parameter for it in this module.
        'unit_nr': params.filter_nr,
        'repeat_block': params.repeat_block,
        'max_pooling': bool(params.max_pooling),
        'mean_pooling': bool(params.mean_pooling),
        'weighted_average_attention': bool(params.weighted_average_attention),
        'concat_mode': params.concat_mode,
        'dense_size': params.dense_size,
        'repeat_dense': params.repeat_dense,
        'dropout_embedding': params.dropout_embedding,
        'rnn_dropout': params.rnn_dropout,
        'dense_dropout': params.dense_dropout,
        'dropout_mode': params.dropout_mode,
        'rnn_kernel_reg_l2': params.rnn_kernel_reg_l2,
        # Previously this was always a copy of rnn_kernel_reg_l2, so the
        # recurrent regularisation could not be set on its own. Prefer the
        # dedicated parameter and fall back to the old value when it is absent.
        'rnn_recurrent_reg_l2': _param_with_fallback('rnn_recurrent_reg_l2',
                                                     'rnn_kernel_reg_l2'),
        'rnn_bias_reg_l2': params.rnn_bias_reg_l2,
        'dense_kernel_reg_l2': params.dense_kernel_reg_l2,
        'dense_bias_reg_l2': params.dense_bias_reg_l2,
        'use_prelu': bool(params.use_prelu),
        'use_batch_norm': bool(params.use_batch_norm),
        'batch_norm_first': bool(params.batch_norm_first),
    })
    return model_params


def _network_config(model_params, checkpoint_dir, shuffle=None):
    """Assemble one network section of the solution config.

    Args:
        model_params: dict placed under ``architecture_config.model_params``.
        checkpoint_dir: sub-directory of ``<experiment_dir>/checkpoints`` in
            which ``best_model.h5`` is stored.
        shuffle: when not ``None``, added to ``training_config`` as 'shuffle';
            when ``None`` the key is left out entirely.

    Returns:
        A new dict with 'architecture_config', 'training_config' and
        'callbacks_config' entries.
    """
    training_config = {'epochs': params.epochs_nr}
    if shuffle is not None:
        training_config['shuffle'] = shuffle
    training_config['batch_size'] = params.batch_size_train

    return {
        'architecture_config': {
            'model_params': model_params,
            'optimizer_params': {
                'lr': params.lr,
                'momentum': params.momentum,
                'nesterov': True,
            },
        },
        'training_config': training_config,
        'callbacks_config': {
            'model_checkpoint': {
                'filepath': os.path.join(params.experiment_dir, 'checkpoints',
                                         checkpoint_dir,
                                         'best_model.h5'),
                'save_best_only': True,
                'save_weights_only': False,
            },
            'lr_scheduler': {'gamma': params.gamma},
            'early_stopping': {'patience': params.patience},
            'neptune_monitor': {},
        },
    }


# Configuration for every pipeline step, keyed by step name. All values are
# taken from ``params`` when this module is imported.
SOLUTION_CONFIG = AttrDict({
    'env': {'cache_dirpath': params.experiment_dir},

    # --- preprocessing -----------------------------------------------------
    'xy_splitter': {
        'x_columns': X_COLUMNS,
        'y_columns': Y_COLUMNS,
    },
    'text_cleaner': {
        'drop_punctuation': bool(params.drop_punctuation),
        'drop_newline': bool(params.drop_newline),
        'drop_multispaces': bool(params.drop_multispaces),
        'all_lower_case': bool(params.all_lower_case),
        'fill_na_with': params.fill_na_with,
        'deduplication_threshold': params.deduplication_threshold,
        'anonymize': bool(params.anonymize),
        'apostrophes': bool(params.apostrophes),
        'use_stopwords': bool(params.use_stopwords),
    },
    'bad_word_filter': {'word_list_filepath': params.bad_words_filepath},

    # --- tokenizers / vectorizers -------------------------------------------
    'char_tokenizer': {
        'char_level': True,
        'maxlen': params.maxlen_char,
        'num_words': params.max_features_char,
    },
    'word_tokenizer': {
        'char_level': False,
        'maxlen': params.maxlen_words,
        'num_words': params.max_features_word,
    },
    'tfidf_char_vectorizer': {
        'sublinear_tf': True,
        'strip_accents': 'unicode',
        'analyzer': 'char',
        'token_pattern': r'\w{1,}',
        'ngram_range': (1, params.char_ngram_max),
        'max_features': params.max_features_char,
    },
    'tfidf_word_vectorizer': {
        'sublinear_tf': True,
        'strip_accents': 'unicode',
        'analyzer': 'word',
        'token_pattern': r'\w{1,}',
        'ngram_range': (1, 1),
        'max_features': params.max_features_word,
    },
    'embeddings': {
        'pretrained_filepath': params.embedding_filepath,
        'max_features': params.max_features_word,
        'embedding_size': params.word_embedding_size,
    },

    # --- neural networks ------------------------------------------------------
    # Only dpcnn_network and scnn_network set 'shuffle' explicitly; the other
    # networks leave the key out of their training_config.
    'dpcnn_network': _network_config(_conv_model_params(_word_input_params()),
                                     'dpcnn_network', shuffle=True),
    'scnn_network': _network_config(_conv_model_params(_word_input_params()),
                                    'scnn_network', shuffle=True),
    'lstm_network': _network_config(_rnn_model_params(_word_input_params()),
                                    'lstm_network'),
    'gru_network': _network_config(_rnn_model_params(_word_input_params()),
                                   'gru_network'),
    'char_vdcnn_network': _network_config(_conv_model_params(_char_input_params()),
                                          'char_vdcnn_network'),
    # The stacker has no input/embedding entries, and its checkpoint directory
    # ('stacker_gru') intentionally differs from the config key to match the
    # existing on-disk layout.
    'rnn_stacker': _network_config(_rnn_model_params({}), 'stacker_gru'),

    # --- classical models / ensembling ----------------------------------------
    # NOTE(review): label_nr is hard-coded to 6 in the three sections below;
    # presumably it must equal len(Y_COLUMNS) -- confirm before changing labels.
    'logistic_regression_multilabel': {
        'label_nr': 6,
        'C': params.log_reg_c,
        'penalty': params.log_reg_penalty,
        'solver': 'sag',
        'max_iter': params.max_iter,
        'n_jobs': params.num_workers,
    },
    'catboost_ensemble': {
        'label_nr': 6,
        'iterations': params.catboost__iterations,
        'learning_rate': params.catboost__learning_rate,
        'depth': params.catboost__depth,
        'l2_leaf_reg': params.catboost__l2_leaf_reg,
        'border_count': params.catboost__border_count,
        'verbose': bool(params.catboost__verbose),
    },
    'blender_ensemble': {
        'func': multi_roc_auc_score,
        'min': False,
        'method': params.blender__method,
        'runs': params.blender__runs,
        'maxiter': params.blender__maxiter,
    },
    'xgboost_ensemble': {
        'label_nr': 6,
        'objective': params.xgboost__objective,
        'eval_metric': params.xgboost__eval_metric,
        'n_estimators': params.xgboost__n_estimators,
        'learning_rate': params.xgboost__learning_rate,
        'max_depth': params.xgboost__max_depth,
        'min_child_weight': params.xgboost__min_child_weight,
        'gamma': params.xgboost__gamma,
        'colsample_bytree': params.xgboost__colsample_bytree,
        'subsample': params.xgboost__subsample,
        'reg_lambda': params.xgboost__reg_lambda,
        'reg_alpha': params.xgboost__reg_alpha,
        'n_jobs': params.num_workers,
    },

    # SECURITY: eval() executes whatever expression the experiment configuration
    # supplies for these two values. Only run with trusted configurations. If
    # the values are always plain numeric literals, ast.literal_eval would be a
    # safer drop-in -- TODO confirm the accepted value format before switching.
    'clipper': {
        'lower': eval(params.clipper__lower),
        'upper': eval(params.clipper__upper),
    },
})