# SceneDesc.py
import pickle

import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import LSTM, Embedding, TimeDistributed, Dense, RepeatVector, Merge, Activation
from keras.preprocessing import image, sequence

# Dimensionality of the shared image/word embedding space.
EMBEDDING_DIM = 128
class scenedesc():
    def __init__(self):
        self.vocab_size = None
        self.no_samples = None
        self.max_length = None
        self.index_word = None
        self.word_index = None
        # Precomputed CNN feature vectors for the training images, keyed by image id.
        self.image_encodings = pickle.load(open("image_encodings.p", "rb"))
        self.captions = None
        self.img_id = None
        self.values()
    def values(self):
        # Each line of trainimgs.txt holds an image id and one caption, tab-separated.
        dataframe = pd.read_csv('Flickr8K_Text/trainimgs.txt', delimiter='\t')
        self.captions = []
        self.img_id = []
        for _, row in dataframe.iterrows():
            self.img_id.append(row.iloc[0])
            self.captions.append(row.iloc[1])
        # One training sample is generated per word transition, so a caption
        # of n words contributes n - 1 samples.
        self.no_samples = 0
        tokens = []
        for caption in self.captions:
            self.no_samples += len(caption.split()) - 1
            tokens.append(caption.split())
        vocab = []
        for token in tokens:
            vocab.extend(token)
        # Total number of word tokens before deduplication.
        print(len(vocab))
        vocab = list(set(vocab))
        self.vocab_size = len(vocab)
        caption_lengths = [len(caption.split()) for caption in self.captions]
        self.max_length = max(caption_lengths)
        # Bidirectional lookup between words and integer indices.
        self.word_index = {}
        self.index_word = {}
        for i, word in enumerate(vocab):
            self.word_index[word] = i
            self.index_word[i] = word
    def data_process(self, batch_size):
        # Infinite generator yielding ([image features, partial captions],
        # next-word targets) batches for generator-based training.
        partial_captions = []
        next_words = []
        images = []
        total_count = 0
        while True:
            image_counter = -1
            for caption in self.captions:
                image_counter += 1
                current_image = self.image_encodings[self.img_id[image_counter]]
                for i in range(len(caption.split()) - 1):
                    total_count += 1
                    # The words seen so far, as vocabulary indices.
                    partial = [self.word_index[txt] for txt in caption.split()[:i + 1]]
                    partial_captions.append(partial)
                    # One-hot target vector for the next word.
                    next_word = np.zeros(self.vocab_size)
                    next_word[self.word_index[caption.split()[i + 1]]] = 1
                    next_words.append(next_word)
                    images.append(current_image)
                    if total_count >= batch_size:
                        next_words = np.asarray(next_words)
                        images = np.asarray(images)
                        partial_captions = sequence.pad_sequences(
                            partial_captions, maxlen=self.max_length, padding='post')
                        total_count = 0
                        yield [[images, partial_captions], next_words]
                        partial_captions = []
                        next_words = []
                        images = []
    def load_image(self, path):
        # Load an image and resize it to the 224x224 input expected by the
        # CNN feature extractor.
        img = image.load_img(path, target_size=(224, 224))
        x = image.img_to_array(img)
        return np.asarray(x)
    def create_model(self, ret_model=False):
        # Image branch: project the 4096-d CNN feature vector into the
        # embedding space and repeat it once per time step.
        image_model = Sequential()
        image_model.add(Dense(EMBEDDING_DIM, input_dim=4096, activation='relu'))
        image_model.add(RepeatVector(self.max_length))
        # Language branch: embed the partial caption and run it through an LSTM.
        lang_model = Sequential()
        lang_model.add(Embedding(self.vocab_size, 256, input_length=self.max_length))
        lang_model.add(LSTM(256, return_sequences=True))
        lang_model.add(TimeDistributed(Dense(EMBEDDING_DIM)))
        # Concatenate both branches and predict a softmax distribution over
        # the vocabulary for the next word. Note: the Merge layer was removed
        # in Keras 2, so this definition requires Keras 1.x.
        model = Sequential()
        model.add(Merge([image_model, lang_model], mode='concat'))
        model.add(LSTM(1000, return_sequences=False))
        model.add(Dense(self.vocab_size))
        model.add(Activation('softmax'))
        print("Model created!")
        if ret_model:
            return model
        model.compile(loss='categorical_crossentropy', optimizer='rmsprop',
                      metrics=['accuracy'])
        return model
    def get_word(self, index):
        return self.index_word[index]
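

# A minimal usage sketch, not part of the original file: it assumes
# "image_encodings.p" and "Flickr8K_Text/trainimgs.txt" exist alongside this
# script, and uses the Keras 1.x fit_generator signature to match the Merge
# layer above. The batch size is a hypothetical choice.
if __name__ == "__main__":
    sd = scenedesc()
    model = sd.create_model()
    batch_size = 128
    model.fit_generator(sd.data_process(batch_size),
                        samples_per_epoch=sd.no_samples, nb_epoch=1)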