#!/usr/bin/env python
"""
Filename: train.py
Author: Denny Britz, Emily Daniels
Date: May 2016
Purpose: Trains the model on the text specified. This class is a
re-implementation of an RNN Theano example created by Denny Britz:
https://github.com/dennybritz/rnn-tutorial-rnnlm
"""
from __future__ import print_function
import csv
import itertools
import numpy as np
import nltk
import sys
import os
import time
from datetime import datetime
from rnn import RNN


class Train(object):
def __init__(self, vocabulary_size, hidden_dim, learning_rate, nepoch,
enable_training, model_file, train_file):
self.vocabulary_size = vocabulary_size
self.hidden_dim = hidden_dim
self.learning_rate = learning_rate
self.nepoch = nepoch
self.enable_training = enable_training
self.train_file = train_file
self.model_file = model_file
self.unknown_token = "UNKNOWN_TOKEN"
self.sentence_start_token = "SENTENCE_START"
self.sentence_end_token = "SENTENCE_END"
self.sentences = []
self.word_to_index = {}
self.index_to_word = []
self.tokenized_sentences = []
self.split_sentences()
self.tokenize_sentences()
self.model = self.create_training_data()

    def train_with_sgd(self, model, X_train, y_train, learning_rate, nepoch,
evaluate_loss_after=5):
losses = []
num_examples_seen = 0
for epoch in range(nepoch):
# optionally evaluate the loss
if epoch % evaluate_loss_after == 0:
loss = model.calculate_loss(X_train, y_train)
losses.append((num_examples_seen, loss))
                timestamp = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
                print("%s: Loss after num_examples_seen=%d epoch=%d: %f" %
                      (timestamp, num_examples_seen, epoch, loss))
                # halve the learning rate if the loss increased
                if len(losses) > 1 and losses[-1][1] > losses[-2][1]:
                    learning_rate *= 0.5
                    print("Setting learning rate to %f" % learning_rate)
                sys.stdout.flush()
                # checkpoint the model parameters after each loss evaluation
                self.save_model_parameters("./data/rnn-%s-%d-%d-%s.npz" % (
                    os.path.basename(self.train_file), model.hidden_dim,
                    model.word_dim, timestamp), model)
for i in range(len(y_train)):
model.sgd_step(X_train[i], y_train[i], learning_rate)
num_examples_seen += 1

    def save_model_parameters(self, outfile, model):
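        # pull the current values of the shared variables U, V, W and write
        # them to a NumPy .npz archive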
U, V, W = model.U.get_value(), model.V.get_value(), model.W.get_value()
np.savez(outfile, U=U, V=V, W=W)
print("Saved model parameters to %s." % outfile)

    def load_model_parameters(self, path, model):
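        # read U, V, W from a saved .npz archive, infer the model dimensions
        # from U, and copy the arrays into the model's shared variables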
npzfile = np.load(path)
U, V, W = npzfile["U"], npzfile["V"], npzfile["W"]
model.hidden_dim = U.shape[0]
model.word_dim = U.shape[1]
model.U.set_value(U)
model.V.set_value(V)
model.W.set_value(W)
print("Loaded model parameters from %s. hidden_dim=%d word_dim=%d" %
(path, U.shape[0], U.shape[1]))

    def split_sentences(self):
print("Reading CSV file...")
with open(self.train_file, 'rU') as f:
reader = csv.reader(f, skipinitialspace=True)
            next(reader)  # skip the header row
# extra decoding to account for non UTF-8 characters
self.sentences = itertools.chain(*[nltk.sent_tokenize(
x[0].decode('latin-1').encode('utf-8').decode('utf-8').lower())
for x in reader])
self.sentences = ["%s %s %s" % (
self.sentence_start_token, x, self.sentence_end_token) for x in
self.sentences]
print("Parsed %d sentences." % (len(self.sentences)))

    def tokenize_sentences(self):
# tokenize the sentences into words and count the word frequencies
# get most common words, build index_to_word and word_to_index vectors
self.tokenized_sentences = [nltk.word_tokenize(sent) for sent in
self.sentences]
word_freq = nltk.FreqDist(itertools.chain(*self.tokenized_sentences))
print("Found %d unique word tokens." % len(word_freq.items()))
vocab = word_freq.most_common(self.vocabulary_size - 1)
self.index_to_word = [x[0] for x in vocab]
self.index_to_word.append(self.unknown_token)
self.word_to_index = dict(
[(w, i) for i, w in enumerate(self.index_to_word)])
print("Using vocabulary size %d." % self.vocabulary_size)
print(
"The least frequent word is '%s' appearing %d times." % (
vocab[-1][0], vocab[-1][1]))
# replace all words not in our vocabulary with the unknown token
for i, sent in enumerate(self.tokenized_sentences):
self.tokenized_sentences[i] = [
w if w in self.word_to_index else self.unknown_token for w in
sent]

    def create_training_data(self):
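        # for each sentence, the input is every token except the last and the
        # target is the same sequence shifted left by one token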
X_train = np.asarray(
[[self.word_to_index[w] for w in sent[:-1]] for sent in
self.tokenized_sentences])
y_train = np.asarray(
[[self.word_to_index[w] for w in sent[1:]] for sent in
self.tokenized_sentences])
        # the third constructor argument is assumed to be the BPTT truncation
        # depth, following the original rnn-tutorial-rnnlm implementation
        model = RNN(self.vocabulary_size, self.hidden_dim, 4)
        # time a single SGD step to gauge how long training will take
        t1 = time.time()
        model.sgd_step(X_train[10], y_train[10], self.learning_rate)
        t2 = time.time()
        print("SGD step time: %f milliseconds" % ((t2 - t1) * 1000.))
if self.model_file is not None:
self.load_model_parameters(self.model_file, model)
if self.enable_training:
self.train_with_sgd(model, X_train, y_train, self.learning_rate,
self.nepoch)
return model
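

# Example usage: a minimal sketch of how this class might be driven. The
# hyperparameter values and the CSV path below are illustrative assumptions,
# not values taken from the original project.
if __name__ == "__main__":
    Train(vocabulary_size=8000, hidden_dim=80, learning_rate=0.005, nepoch=10,
          enable_training=True, model_file=None,
          train_file="./data/comments.csv")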