-
Notifications
You must be signed in to change notification settings - Fork 0
/
prac_lstm.py
executable file
·200 lines (163 loc) · 7.68 KB
/
prac_lstm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
#! /usr/bin/python
"""Short and sweet LSTM implementation in Tensorflow.
Motivation:
When Tensorflow was released, adding RNNs was a bit of a hack - it required
building separate graphs for every number of timesteps and was a bit obscure
to use. Since then TF devs added things like `dynamic_rnn`, `scan` and `map_fn`.
Currently the APIs are decent, but all the tutorials that I am aware of are not
making the best use of the new APIs.
Advantages of this implementation:
- No need to specify the number of timesteps ahead of time. The number of
timesteps is inferred from the shape of the input tensor, so the same graph can
be used for multiple different numbers of timesteps.
- No need to specify the batch size ahead of time. The batch size is inferred
from the shape of the input tensor, so the same graph can be used for multiple
different batch sizes.
- Easy to swap out different recurrent gadgets (RNN, LSTM, GRU, your new
creative idea)
"""
import numpy as np
import random
import sys
import tensorflow as tf
import tensorflow.contrib.layers as layers
from basic_classif import *
map_fn = tf.map_fn
################################################################################
## DATASET GENERATION ##
## ##
## The problem we are trying to solve is adding two binary numbers. The ##
## numbers are reversed, so that the state of RNN can add the numbers ##
## perfectly provided it can learn to store carry in the state. Timestep t ##
## corresponds to bit len(number) - t. ##
################################################################################
def as_bytes(num, final_size):
    """Return the ``final_size`` lowest binary digits of ``num``.

    Despite the name, the result is a list of bits (0 or 1), not bytes. The
    least significant bit comes first. Digits beyond ``final_size`` are
    dropped, and missing high digits are reported as 0.
    """
    return [(num // 2 ** position) % 2 for position in range(final_size)]
def generate_example(num_bits):
    """Draw two random summands and return them, with their sum, as bit lists.

    Each summand is drawn uniformly from ``[0, 2**(num_bits - 1) - 1]``, so
    the sum always fits in ``num_bits`` bits.

    Returns a tuple ``(a_bits, b_bits, sum_bits)``. Each element is the
    ``num_bits``-long list produced by ``as_bytes`` (least significant bit
    first).
    """
    largest_summand = 2 ** (num_bits - 1) - 1
    # Draw order (first, then second) is kept so seeded runs are reproducible.
    first = random.randint(0, largest_summand)
    second = random.randint(0, largest_summand)
    return tuple(
        as_bytes(value, num_bits) for value in (first, second, first + second)
    )
def generate_batch(num_bits, batch_size):
    """Generate a batch of binary-addition problems.

    Returns
    -------
    x: np.array
        The two numbers to be added, represented by bits.
        shape: b, i, n
        where:
            b is the bit index counted from the least significant end
            i is the example index within the batch
            n is 0 for the first summand and 1 for the second
    y: np.array
        The result of the addition.
        shape: b, i, n
        where:
            b is the bit index counted from the least significant end
            i is the example index within the batch
            n is always 0
    """
    # Draw every example up front, then copy the bit lists into the arrays.
    examples = [generate_example(num_bits) for _ in range(batch_size)]

    x = np.empty((num_bits, batch_size, 2))
    y = np.empty((num_bits, batch_size, 1))
    for idx, (first_bits, second_bits, sum_bits) in enumerate(examples):
        x[:, idx, 0] = first_bits
        x[:, idx, 1] = second_bits
        y[:, idx, 0] = sum_bits
    return x, y
################################################################################
## GRAPH DEFINITION ##
################################################################################
INPUT_SIZE = 2  # 2 bits per timestep
RNN_HIDDEN = 20
OUTPUT_SIZE = 1  # 1 bit per timestep
TINY = 1e-6  # to avoid NaNs in logs
LEARNING_RATE = 0.01

USE_LSTM = True

# tf.nn.dynamic_rnn requires inputs of rank >= 3. With time_major=True the
# layout is (time, batch, features). Rank-2 placeholders make graph
# construction fail, so the feature axis is declared explicitly here.
# NOTE(review): whatever is fed to these placeholders must be shaped
# (time, batch, INPUT_SIZE) and (time, batch, OUTPUT_SIZE) - confirm that the
# arrays used by the training loop match.
inputs = tf.placeholder(tf.float32, (None, None, INPUT_SIZE))  # (time, batch, in)
outputs = tf.placeholder(tf.float32, (None, None, OUTPUT_SIZE))  # (time, batch, out)

## Here cell can be any function you want, provided it has two attributes:
#     - cell.zero_state(batch_size, dtype) - tensor which is an initial value
#                                            for state in __call__
#     - cell.__call__(input, state) - function that given input and previous
#                                     state returns tuple (output, state) where
#                                     state is the state passed to the next
#                                     timestep and output is the tensor used
#                                     for inferring the output at timestep. For
#                                     example for LSTM, output is just hidden,
#                                     but state is memory + hidden
# Example LSTM cell with learnable zero_state can be found here:
#    https://gist.github.com/nivwusquorum/160d5cf7e1e82c21fad3ebf04f039317
if USE_LSTM:
    cell = tf.contrib.rnn.BasicLSTMCell(RNN_HIDDEN, state_is_tuple=True)
else:
    cell = tf.contrib.rnn.BasicRNNCell(RNN_HIDDEN)

# Create initial state. Here it is just a constant tensor filled with zeros,
# but in principle it could be a learnable parameter. This is a bit tricky
# to do for LSTM's tuple state, but can be achieved by creating two vector
# Variables, which are then tiled along batch dimension and grouped into tuple.
# The batch dimension is axis 1 because the inputs are time-major.
batch_size = tf.shape(inputs)[1]
initial_state = cell.zero_state(batch_size, tf.float32)

# Given inputs (time, batch, input_size), dynamic_rnn returns a tuple:
#   - rnn_outputs: (time, batch, RNN_HIDDEN) [do not mistake with OUTPUT_SIZE]
#   - rnn_states:  the FINAL state of the cell only (for the LSTM cell this is
#                  a (memory, hidden) tuple), not one state per timestep.
rnn_outputs, rnn_states = tf.nn.dynamic_rnn(
    cell, inputs, initial_state=initial_state, time_major=True)

# Project output from rnn output size to OUTPUT_SIZE. Sometimes it is worth
# adding an extra layer here. The sigmoid squashes each output to (0, 1) so it
# can be read as the probability of the bit being 1.
final_projection = lambda x: layers.linear(x, num_outputs=OUTPUT_SIZE, activation_fn=tf.nn.sigmoid)

# Apply projection to every timestep.
predicted_outputs = map_fn(final_projection, rnn_outputs)

# Compute elementwise binary cross entropy; TINY keeps log() away from zero.
error = -(outputs * tf.log(predicted_outputs + TINY) + (1.0 - outputs) * tf.log(1.0 - predicted_outputs + TINY))
error = tf.reduce_mean(error)

# Optimize.
train_fn = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE).minimize(error)

# Assuming that absolute difference between output and correct answer is 0.5
# or less we can round it to the correct output.
accuracy = tf.reduce_mean(tf.cast(tf.abs(outputs - predicted_outputs) < 0.5, tf.float32))
################################################################################
## TRAINING LOOP ##
################################################################################
BATCH_SIZE = 10

# makeData is provided by the star import from basic_classif. It returns five
# values; valid_y_expanded is not used by this script.
# NOTE(review): the layout of these arrays is not visible from this file -
# confirm that it matches the shapes of the `inputs` / `outputs` placeholders.
train_x, train_y, valid_x, valid_y, valid_y_expanded = makeData()

ITERATIONS_PER_EPOCH = train_x.shape[0] // BATCH_SIZE
if ITERATIONS_PER_EPOCH == 0:
    # Fail early with a clear message instead of a ZeroDivisionError when the
    # epoch error is averaged below.
    raise ValueError(
        "Training set has %d rows, fewer than BATCH_SIZE=%d"
        % (train_x.shape[0], BATCH_SIZE))

# Single-argument print(...) calls behave identically on Python 2 and 3.
print("Partitioned size of data for training")
print(train_x.shape)
print(train_y.shape)
print("Partitioned size of data for testing")
print(valid_x.shape)
print(valid_y.shape)

# The context manager guarantees the session's resources are released even if
# training raises.
with tf.Session() as session:
    # For some reason it is our job to do this:
    session.run(tf.global_variables_initializer())

    for epoch in range(1000):
        epoch_error = 0.0
        for i in range(ITERATIONS_PER_EPOCH):
            # NOTE(review): batches are sliced along axis 0, while the graph is
            # built with time_major=True and reads the batch size from axis 1 -
            # confirm the intended data layout.
            start = i * BATCH_SIZE
            x = train_x[start:start + BATCH_SIZE]
            y = train_y[start:start + BATCH_SIZE]
            # Debug output: dimensions of the current mini-batch.
            print("%d %d %d %d" % (len(x), len(x[0]), len(y), len(y[0])))
            # train_fn is what triggers backprop; error and accuracy on their
            # own do not trigger the backprop.
            batch_error, _ = session.run([error, train_fn], {
                inputs: x,
                outputs: y,
            })
            epoch_error += batch_error
        epoch_error /= ITERATIONS_PER_EPOCH

        valid_accuracy = session.run(accuracy, {
            inputs: valid_x,
            outputs: valid_y,
        })
        print("Epoch %d, train error: %.2f, valid accuracy: %.1f %%" % (epoch, epoch_error, valid_accuracy * 100.0))