from nlp_tools import NLPTools
from collections import OrderedDict

def prototype_state():
    state = {}
    # Random seed
    state['seed'] = 1234
    # Logging level
    state['level'] = 'DEBUG'
    # String for unknown (out-of-vocabulary) tokens
    state['oov'] = '<unk>'
    # Number of tokens before and after the potential speaker segmentation token to condition on.
    state['max_length'] = 20

    # Special tokens need to be hardcoded, because the model architecture may adapt depending on them
    state['unk_sym'] = 0 # unknown word token <unk>
    state['eos_sym'] = 1 # end-of-utterance symbol </s>
    state['eod_sym'] = 2 # end-of-dialogue symbol </d>
    state['first_speaker_sym'] = 3 # first speaker symbol <first_speaker>
    state['second_speaker_sym'] = 4 # second speaker symbol <second_speaker>
    state['third_speaker_sym'] = 5 # third speaker symbol <third_speaker>
    state['minor_speaker_sym'] = 6 # minor speaker symbol <minor_speaker>
    state['voice_over_sym'] = 7 # voice-over symbol <voice_over>
    state['off_screen_sym'] = 8 # off-screen symbol <off_screen>
    state['pause_sym'] = 9 # pause symbol <pause>; double-check that this is correct for every new dictionary
    state['segmentation_token_count'] = 6

    # ----- ACTIVATION FUNCTION -----
    state['rec_activation'] = 'lambda x: T.tanh(x)'
    # Gating of encoder RNNs: either 'plain' (tanh activation function) or 'gated' (GRU activation function)
    state['sent_step_type'] = 'gated'
    # If on, will run four RNNs:
    # - One RNN forwards on the token sequence before the segmentation label
    # - One RNN backwards on the token sequence before the segmentation label
    # - One RNN forwards on the token sequence after the segmentation label
    # - One RNN backwards on the token sequence after the segmentation label
    state['bidirectional_encoder'] = False
    # Dimensionality of the hidden layer of the MLP
    # (which takes as input the hidden states of the two RNNs)
    state['mlp_out_dim'] = 50
    state['mlp_activation_function'] = 'linear' # 'rectifier', 'tanh' or 'linear'
    # If on, the RNN encoder hidden states will be multiplied by each other elementwise to give a new vector,
    # which is then given as input to the MLP.
    state['multiplicative_input_from_encoders'] = True

    # Dimensionality of hidden layers
    state['qdim'] = 512
    # Dimensionality of the low-rank approximation
    state['rankdim'] = 256
    # Threshold at which to clip the gradient
    state['cutoff'] = 1
    # Learning rate for Adam
    state['lr'] = 0.0001

    # Early stopping configuration
    state['patience'] = 5
    state['cost_threshold'] = 1.003
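    # (Presumably: training stops once the validation cost has failed to
    # improve on the best cost by at least a factor of 'cost_threshold' for
    # 'patience' consecutive validation rounds; the exact logic lives in the
    # training script, not here.)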
    # Initialization configuration
    # Whether to initialize from word embeddings (e.g. Word2Vec embeddings)
    state['initialize_from_pretrained_word_embeddings'] = False
    # Word embeddings file (can be produced by running the "convert-wordemb-dict2emb-matrix" script)
    state['pretrained_word_embeddings_file'] = ''
    # Whether or not to keep word embeddings fixed during training
    state['fix_pretrained_word_embeddings'] = False
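    # (The prototypes below pair pretrained embeddings with a matching
    # 'rankdim': 10-dim test embeddings with rankdim = 10 and 50-dim Word2Vec
    # embeddings with rankdim = 50, so the embedding dimensionality
    # presumably has to equal 'rankdim'.)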
    # ----- TRAINING METHOD -----
    # Choose optimization algorithm
    state['updater'] = 'adam'
    # Batch size
    state['bs'] = 80
    # Sort groups of k batches by length
    state['sort_k_batches'] = 20
    # Modify this in each specific prototype
    state['save_dir'] = './'

    # ----- TRAINING PROCESS -----
    # Frequency of training error reports (in number of batches)
    state['train_freq'] = 10
    # Validation frequency
    state['valid_freq'] = 5000
    # Maximum number of batches to process
    state['loop_iters'] = 3000000
    # Maximum number of minutes to run
    state['time_stop'] = 24*60*31
    # Error level at which to stop training
    state['minerr'] = -1

    # ----- EVALUATION PROCESS -----
    # Tokens used to denote sentence boundaries
    state['sentence_break_signs'] = ['.', '!', '?', '</s>']
    # Use precomputed features
    state['use_precomputed_features'] = True
    state['precomputed_features_count'] = 80
    # Use RNN features (can be disabled to run logistic regression on the precomputed features only)
    state['use_rnn_features'] = True
    # Condition the softmax layer (logistic regression) on the previous speaker class
    state['condition_on_previous_speaker_class'] = False
    return state
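
# A minimal sketch (an assumption, not code from the original training
# scripts) of how the 'rec_activation' string can be turned into a callable:
# it references Theano's tensor module as T, so T must be bound when the
# string is eval'd.
def _compile_rec_activation(state):
    import theano.tensor as T
    # Yields a callable equivalent to x -> T.tanh(x)
    return eval(state['rec_activation'], {'T': T})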

# This prototype is only for testing the model...
def prototype_test():
    state = prototype_state()

    # Fill your paths here!
    state['train_dialogues'] = "./tests/data/ttrain.dialogues.pkl"
    state['test_dialogues'] = "./tests/data/ttrain.dialogues.pkl"
    state['valid_dialogues'] = "./tests/data/ttrain.dialogues.pkl"
    state['dictionary'] = "./tests/data/ttrain.dict.pkl"
    state['save_dir'] = "./tests/models/"

    # Handle pretrained word embeddings. Using this requires rankdim=10
    state['initialize_from_pretrained_word_embeddings'] = False
    state['pretrained_word_embeddings_file'] = './tests/data/MT_WordEmb.pkl'
    state['fix_pretrained_word_embeddings'] = False

    # Validation frequency
    state['valid_freq'] = 50

    state['prefix'] = "testmodel_"
    state['updater'] = 'adam'
    state['sent_step_type'] = 'gated'
    state['bidirectional_encoder'] = False

    # If out of memory, modify this!
    state['bs'] = 10
    state['sort_k_batches'] = 1
    state['qdim'] = 20
    state['rankdim'] = 10

    state['nlp_tools'] = NLPTools(state)
    return state
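
# Sketch of the "double check" suggested for state['pause_sym'] above. The
# exact layout of the dictionary pickle is not shown in this file, so the
# token_to_id argument is assumed to be a plain {token: id} mapping built
# from state['dictionary'].
def _check_special_tokens(state, token_to_id):
    assert token_to_id[state['oov']] == state['unk_sym']
    assert token_to_id['</s>'] == state['eos_sym']
    assert token_to_id['</d>'] == state['eod_sym']
    assert token_to_id['<pause>'] == state['pause_sym']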

def prototype_movies():
    state = prototype_state()

    # Fill your paths here!
    state['train_dialogues'] = "Data/Training.dialogues.pkl"
    state['test_dialogues'] = "Data/Test.dialogues.pkl"
    state['valid_dialogues'] = "Data/Validation.dialogues.pkl"
    state['dictionary'] = "Data/Dataset.dict.pkl"
    state['save_dir'] = "Output"

    # Handle pretrained word embeddings.
    # These need to be recomputed if we want them for the 20K vocabulary.
    state['initialize_from_pretrained_word_embeddings'] = True
    state['pretrained_word_embeddings_file'] = 'Data/Word2Vec_WordEmb_50Dim.pkl'
    state['fix_pretrained_word_embeddings'] = True

    # Validation frequency
    state['valid_freq'] = 1250

    state['prefix'] = "MovieScriptModel_"
    state['updater'] = 'adam'

    # If out of memory, modify this!
    state['bs'] = 80
    state['qdim'] = 100
    state['rankdim'] = 50

    # Use precomputed features
    state['use_precomputed_features'] = True
    state['precomputed_features_count'] = 80

    state['nlp_tools'] = NLPTools(state)
    return state
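
# Hypothetical usage sketch (not part of the original code): a training
# script would typically pick one of the prototype_* functions above and
# read its configuration from the returned dictionary.
if __name__ == '__main__':
    state = prototype_state()
    print('seed=%d bs=%d qdim=%d rankdim=%d lr=%g'
          % (state['seed'], state['bs'], state['qdim'],
             state['rankdim'], state['lr']))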