forked from batra-mlp-lab/visdial-rl
-
Notifications
You must be signed in to change notification settings - Fork 0
/
dataloader.py
380 lines (311 loc) · 13.3 KB
/
dataloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
import os
import json
import h5py
import numpy as np
import torch
from six import iteritems
from six.moves import range
from sklearn.preprocessing import normalize
from torch.utils.data import Dataset
class VisDialDataset(Dataset):
def __init__(self, params, subsets):
'''
Initialize the dataset with splits given by 'subsets', where
subsets is taken from ['train', 'val', 'test']
Notation:
'dtype' is a split taking values from ['train', 'val', 'test']
'stype' is a sqeuence type from ['ques', 'ans']
'''
# By default, load Q-Bot, A-Bot and dialog options for A-Bot
self.useQuestion = True
self.useAnswer = True
self.useOptions = True
self.useHistory = True
self.useIm = True
# Absorb parameters
for key, value in iteritems(params):
setattr(self, key, value)
self.subsets = tuple(subsets)
self.numRounds = params['numRounds']
print('\nDataloader loading json file: ' + self.inputJson)
with open(self.inputJson, 'r') as fileId:
info = json.load(fileId)
# Absorb values
for key, value in iteritems(info):
setattr(self, key, value)
wordCount = len(self.word2ind)
# Add <START> and <END> to vocabulary
self.word2ind['<START>'] = wordCount + 1
self.word2ind['<END>'] = wordCount + 2
self.startToken = self.word2ind['<START>']
self.endToken = self.word2ind['<END>']
# Padding token is at index 0
self.vocabSize = wordCount + 3
print('Vocab size with <START>, <END>: %d' % self.vocabSize)
# Construct the reverse map
self.ind2word = {
int(ind): word
for word, ind in iteritems(self.word2ind)
}
# Read questions, answers and options
print('Dataloader loading h5 file: ' + self.inputQues)
quesFile = h5py.File(self.inputQues, 'r')
if self.useIm:
# Read images
print('Dataloader loading h5 file: ' + self.inputImg)
imgFile = h5py.File(self.inputImg, 'r')
# Number of data points in each split (train/val/test)
self.numDataPoints = {}
self.data = {}
# map from load to save labels
ioMap = {
'ques_%s': '%s_ques',
'ques_length_%s': '%s_ques_len',
'ans_%s': '%s_ans',
'ans_length_%s': '%s_ans_len',
'ans_index_%s': '%s_ans_ind',
'img_pos_%s': '%s_img_pos',
'opt_%s': '%s_opt',
'opt_length_%s': '%s_opt_len',
'opt_list_%s': '%s_opt_list'
}
# Processing every split in subsets
for dtype in subsets: # dtype is in [train, val, test]
print("\nProcessing split [%s]..." % dtype)
if ('ques_%s' % dtype) not in quesFile:
self.useQuestion = False
if ('ans_%s' % dtype) not in quesFile:
self.useAnswer = False
if ('opt_%s' % dtype) not in quesFile:
self.useOptions = False
# read the question, answer, option related information
for loadLabel, saveLabel in iteritems(ioMap):
if loadLabel % dtype not in quesFile:
continue
dataMat = np.array(quesFile[loadLabel % dtype], dtype='int64')
self.data[saveLabel % dtype] = torch.from_numpy(dataMat)
# Read image features, if needed
if self.useIm:
print('Reading image features...')
imgFeats = np.array(imgFile['images_' + dtype])
if not self.imgNorm:
continue
# normalize, if needed
print('Normalizing image features..')
imgFeats = normalize(imgFeats, axis=1, norm='l2')
# save img features
self.data['%s_img_fv' % dtype] = torch.FloatTensor(imgFeats)
# Visdial
if hasattr(self, 'unique_img_train') and params['cocoDir']:
coco_dir = params['cocoDir']
with open(params['cocoInfo'], 'r') as f:
coco_info = json.load(f)
id_to_fname = {
im['id']: im['file_path']
for im in coco_info['images']
}
cocoids = getattr(self, 'unique_img_%s'%dtype)
if '.jpg' not in cocoids[0]:
img_fnames = [
os.path.join(coco_dir, id_to_fname[int(cocoid)])
for cocoid in cocoids
]
else:
img_fnames = cocoids
self.data['%s_img_fnames' % dtype] = img_fnames
# read the history, if needed
if self.useHistory:
captionMap = {
'cap_%s': '%s_cap',
'cap_length_%s': '%s_cap_len'
}
for loadLabel, saveLabel in iteritems(captionMap):
mat = np.array(quesFile[loadLabel % dtype], dtype='int32')
self.data[saveLabel % dtype] = torch.from_numpy(mat)
# Number of data points
self.numDataPoints[dtype] = self.data[dtype + '_cap'].size(0)
# Prepare dataset for training
for dtype in subsets:
print("\nSequence processing for [%s]..." % dtype)
self.prepareDataset(dtype)
print("")
# Default pytorch loader dtype is set to train
if 'train' in subsets:
self._split = 'train'
else:
self._split = subsets[0]
@property
def split(self):
return self._split
@split.setter
def split(self, split):
assert split in self.subsets # ['train', 'val', 'test']
self._split = split
#----------------------------------------------------------------------------
# Dataset preprocessing
#----------------------------------------------------------------------------
def prepareDataset(self, dtype):
if self.useHistory:
self.processCaption(dtype)
# prefix/postfix with <START> and <END>
if self.useOptions:
self.processOptions(dtype)
# options are 1-indexed, changed to 0-indexed
self.data[dtype + '_opt'] -= 1
# process answers and questions
if self.useAnswer:
self.processSequence(dtype, stype='ans')
# 1 indexed to 0 indexed
self.data[dtype + '_ans_ind'] -= 1
if self.useQuestion:
self.processSequence(dtype, stype='ques')
def processSequence(self, dtype, stype='ans'):
'''
Add <START> and <END> token to answers or questions.
Arguments:
'dtype' : Split to use among ['train', 'val', 'test']
'sentType' : Sequence type, either 'ques' or 'ans'
'''
assert stype in ['ques', 'ans']
prefix = dtype + "_" + stype
seq = self.data[prefix]
seqLen = self.data[prefix + '_len']
numConvs, numRounds, maxAnsLen = seq.size()
newSize = torch.Size([numConvs, numRounds, maxAnsLen + 2])
sequence = torch.LongTensor(newSize).fill_(0)
# decodeIn begins with <START>
sequence[:, :, 0] = self.word2ind['<START>']
endTokenId = self.word2ind['<END>']
for thId in range(numConvs):
for rId in range(numRounds):
length = seqLen[thId, rId]
if length == 0:
print('Warning: Skipping empty %s sequence at (%d, %d)'\
%(stype, thId, rId))
continue
sequence[thId, rId, 1:length + 1] = seq[thId, rId, :length]
sequence[thId, rId, length + 1] = endTokenId
# Sequence length is number of tokens + 1
self.data[prefix + "_len"] = seqLen + 1
self.data[prefix] = sequence
def processCaption(self, dtype):
'''
Add <START> and <END> token to caption.
Arguments:
'dtype' : Split to use among ['train', 'val', 'test']
'''
prefix = dtype + '_cap'
seq = self.data[prefix]
seqLen = self.data[prefix + '_len']
numConvs, maxCapLen = seq.size()
newSize = torch.Size([numConvs, maxCapLen + 2])
sequence = torch.LongTensor(newSize).fill_(0)
# decodeIn begins with <START>
sequence[:, 0] = self.word2ind['<START>']
endTokenId = self.word2ind['<END>']
for thId in range(numConvs):
length = seqLen[thId]
if length == 0:
print('Warning: Skipping empty %s sequence at (%d)' % (stype,
thId))
continue
sequence[thId, 1:length + 1] = seq[thId, :length]
sequence[thId, length + 1] = endTokenId
# Sequence length is number of tokens + 1
self.data[prefix + "_len"] = seqLen + 1
self.data[prefix] = sequence
def processOptions(self, dtype):
ans = self.data[dtype + '_opt_list']
ansLen = self.data[dtype + '_opt_len']
ansListLen, maxAnsLen = ans.size()
newSize = torch.Size([ansListLen, maxAnsLen + 2])
options = torch.LongTensor(newSize).fill_(0)
# decodeIn begins with <START>
options[:, 0] = self.word2ind['<START>']
endTokenId = self.word2ind['<END>']
for ansId in range(ansListLen):
length = ansLen[ansId]
if length == 0:
print('Warning: Skipping empty option answer list at (%d)'\
%ansId)
continue
options[ansId, 1:length + 1] = ans[ansId, :length]
options[ansId, length + 1] = endTokenId
self.data[dtype + '_opt_len'] = ansLen + 1
self.data[dtype + '_opt_seq'] = options
#----------------------------------------------------------------------------
# Dataset helper functions for PyTorch's datalaoder
#----------------------------------------------------------------------------
def __len__(self):
# Assert that loader_dtype is in subsets ['train', 'val', 'test']
return self.numDataPoints[self._split]
def __getitem__(self, idx):
item = self.getIndexItem(self._split, idx)
return item
def collate_fn(self, batch):
out = {}
mergedBatch = {key: [d[key] for d in batch] for key in batch[0]}
for key in mergedBatch:
if key == 'img_fname' or key == 'index':
out[key] = mergedBatch[key]
elif key == 'cap_len':
# 'cap_lens' are single integers, need special treatment
out[key] = torch.LongTensor(mergedBatch[key])
else:
out[key] = torch.stack(mergedBatch[key], 0)
# Dynamic shaping of padded batch
if 'ques' in out.keys():
quesLen = out['ques_len'] + 1
out['ques'] = out['ques'][:, :, :torch.max(quesLen)].contiguous()
if 'ans' in out.keys():
ansLen = out['ans_len'] + 1
out['ans'] = out['ans'][:, :, :torch.max(ansLen)].contiguous()
if 'cap' in out.keys():
capLen = out['cap_len'] + 1
out['cap'] = out['cap'][:, :torch.max(capLen)].contiguous()
if 'opt' in out.keys():
optLen = out['opt_len'] + 1
out['opt'] = out['opt'][:, :, :, :torch.max(optLen) + 2].contiguous()
return out
#----------------------------------------------------------------------------
# Dataset indexing
#----------------------------------------------------------------------------
def getIndexItem(self, dtype, idx):
item = {'index': idx}
# get question
if self.useQuestion:
ques = self.data[dtype + '_ques'][idx]
quesLen = self.data[dtype + '_ques_len'][idx]
item['ques'] = ques
item['ques_len'] = quesLen
# get answer
if self.useAnswer:
ans = self.data[dtype + '_ans'][idx]
ansLen = self.data[dtype + '_ans_len'][idx]
item['ans_len'] = ansLen
item['ans'] = ans
# get caption
if self.useHistory:
cap = self.data[dtype + '_cap'][idx]
capLen = self.data[dtype + '_cap_len'][idx]
item['cap'] = cap
item['cap_len'] = capLen
if self.useOptions:
optInds = self.data[dtype + '_opt'][idx]
ansId = self.data[dtype + '_ans_ind'][idx]
optSize = list(optInds.size())
newSize = torch.Size(optSize + [-1])
indVector = optInds.view(-1)
optLens = self.data[dtype + '_opt_len'].index_select(0, indVector)
optLens = optLens.view(optSize)
opts = self.data[dtype + '_opt_seq'].index_select(0, indVector)
item['opt'] = opts.view(newSize)
item['opt_len'] = optLens
item['ans_id'] = ansId
# if image needed
if self.useIm:
item['img_feat'] = self.data[dtype + '_img_fv'][idx]
# item['img_fname'] = self.data[dtype + '_img_fnames'][idx]
if dtype + '_img_labels' in self.data:
item['img_label'] = self.data[dtype + '_img_labels'][idx]
return item