# Nishant Nikhil ([email protected])
# An implementation of the convolutional Deep Semantic Similarity Model (C-DSSM) described in [1].
# [1] Shen, Y., He, X., Gao, J., Deng, L., and Mesnil, G. 2014. A latent semantic model
# with convolutional-pooling structure for information retrieval. In CIKM, pp. 101-110.
# http://research.microsoft.com/pubs/226585/cikm2014_cdssm_final.pdf
# [2] http://research.microsoft.com/en-us/projects/dssm/
# [3] http://research.microsoft.com/pubs/238873/wsdm2015.v3.pdf
import torch
import torch.nn as nn
import torch.nn.functional as F
LETTER_GRAM_SIZE = 3 # See section 3.2.
WINDOW_SIZE = 3 # See section 3.2.
TOTAL_LETTER_GRAMS = 8246 # Determined from data. See section 3.2.
# Per equation (1), WORD_DEPTH = WINDOW_SIZE * TOTAL_LETTER_GRAMS; overridden below for the data used here.
WORD_DEPTH = 29244 # See equation (1).
# Uncomment when testing with a smaller input dimensionality:
# WORD_DEPTH = 1000
CONV_DIM = 1024
K = 300 # Dimensionality of the max-pooling layer. See section 3.4.
L = 128 # Dimensionality of latent semantic space. See section 3.5.
J = 4 # Number of random unclicked documents serving as negative examples for a query. See section 4.
FILTER_LENGTH = 3 # Convolution filter width: each filter spans three consecutive word positions.
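
# Dimension check for equation (1): each input "word" vector concatenates the letter-trigram
# vectors of a WINDOW_SIZE-word context window, giving a nominal size of
# WINDOW_SIZE * TOTAL_LETTER_GRAMS = 3 * 8246 = 24738. The value 29244 used above corresponds
# to 3 * 9748 trigrams, so the trigram vocabulary of the data used with this code is
# presumably larger than the paper's.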
def kmax_pooling(x, dim, k):
    """Return the k largest values of x along `dim`, kept in their original order."""
    index = x.topk(k, dim=dim)[1]
    index = index.sort(dim=dim)[0]
    return x.gather(dim, index)
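
# A minimal illustration (not from the original code) of kmax_pooling on a
# (batch, channels, time) tensor:
#
#   x = torch.randn(2, K, 7)
#   y = kmax_pooling(x, dim=2, k=1)   # shape (2, K, 1): the per-channel max over time
#
# With k = 1 this is global max pooling over the time axis; for k > 1 the k largest
# activations per channel are kept in their original temporal order.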
class CDSSM(nn.Module):
def __init__(self):
super(CDSSM, self).__init__()
# layers for query
self.query_conv = nn.Conv1d(WORD_DEPTH, CONV_DIM, FILTER_LENGTH)
        # Xavier (Glorot) uniform initialization
torch.nn.init.xavier_uniform_(self.query_conv.weight)
# adding a second convolutional layer
self.second_query_conv = nn.Conv1d(CONV_DIM, K, FILTER_LENGTH)
torch.nn.init.xavier_uniform_(self.second_query_conv.weight)
self.max_pool = nn.MaxPool1d(3)
# learn the semantic representation
self.query_sem = nn.Linear(K, L)
torch.nn.init.xavier_uniform_(self.query_sem.weight)
# adding a hidden layer
# self.second_query_sem = nn.Linear(L, L)
# self.second_doc_sem = nn.Linear(L, L)
# dropout for regularization
self.dropout = nn.Dropout(0.4)
print("Using 30% dropout with an extra conv!")
# layers for docs
self.doc_conv = nn.Conv1d(WORD_DEPTH, CONV_DIM, FILTER_LENGTH)
torch.nn.init.xavier_uniform_(self.doc_conv.weight)
self.second_doc_conv = nn.Conv1d(CONV_DIM, K, FILTER_LENGTH)
torch.nn.init.xavier_uniform_(self.second_doc_conv.weight)
self.doc_sem = nn.Linear(K, L)
torch.nn.init.xavier_uniform_(self.doc_sem.weight)
# learning gamma
# self.learn_gamma = nn.Conv1d(1, 1, 1)
# torch.nn.init.xavier_uniform_(self.learn_gamma.weight)
# adding batch norm
self.q_norm = nn.BatchNorm1d(WORD_DEPTH)
self.doc_norm = nn.BatchNorm1d(WORD_DEPTH)
        # batch norm over the pooled representations, before the semantic layers
self.sem_q_norm = nn.BatchNorm1d(1)
self.sem_doc_norm = nn.BatchNorm1d(1)
self.concat_sem = nn.Linear(2*L, 2)
torch.nn.init.xavier_uniform_(self.concat_sem.weight)
        self.softmax = nn.LogSoftmax(dim=-1)  # log-probabilities over the last (class) dimension
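
    # Shape summary for one tower, derived from the constants above (N = batch size,
    # T = number of word positions in the input):
    #   (N, WORD_DEPTH, T) --Conv1d--> (N, CONV_DIM, T-2) --MaxPool1d(3)--> (N, CONV_DIM, (T-2)//3)
    #   --Conv1d--> (N, K, (T-2)//3 - 2) --k-max pool (k=1)--> (N, K, 1) --Linear--> (N, 1, L)
    # The query and document L-dimensional vectors are concatenated and mapped by
    # concat_sem to two scores.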
def forward(self, q, pos):
# Query model. The paper uses separate neural nets for queries and documents (see section 5.2).
        # To make the inputs compatible with Conv1d, we reshape them to (batch_size, WORD_DEPTH, seq_len).
# print("Query initial shape: {}".format(q.shape))
# print("Evidence initial shape: {}".format(pos.shape))
q = q.transpose(1,2)
pos = pos.transpose(1,2)
# print("Query reshape: {}".format(q.shape))
        # In this step, we transform each word vector with WORD_DEPTH dimensions into its
        # convolved representation. The first convolution produces CONV_DIM feature maps;
        # the second (applied further below) produces K, one per kernel/filter. Essentially,
        # each convolution takes the dot product of a weight matrix (W_c) with each window of
        # word vectors (l_t) from the query matrix (l_Q), adds a bias vector (b_c), and then
        # applies the tanh activation. That is, h_Q = tanh(W_c • l_Q + b_c). Note: the paper
        # does not include bias units.
q = self.q_norm(q)
pos = self.doc_norm(pos)
q_c = torch.tanh(self.query_conv(q))
pos_c = torch.tanh(self.doc_conv(pos))
# print("Size after convolution: {}".format(q_c.shape))
q_c = self.max_pool(q_c)
pos_c = self.max_pool(pos_c)
q_c = torch.tanh(self.second_query_conv(q_c))
pos_c = torch.tanh(self.second_doc_conv(pos_c))
        # Next, we apply k-max pooling (here k = 1, i.e., a global max over the time
        # dimension) to the convolved query and document matrices.
q_k = kmax_pooling(q_c, 2, 1)
pos_k = kmax_pooling(pos_c, 2, 1)
# print("Size after max pooling: {}".format(q_k.shape))
# q_k = torch.tanh(self.second_query_conv(q_k))
# pos_k = torch.tanh(self.second_doc_conv(pos_k))
        # With k = 1 the representations already have length 1, so this second k-max pooling
        # is effectively a no-op.
        q_k = kmax_pooling(q_k, 2, 1)
        pos_k = kmax_pooling(pos_k, 2, 1)
q_k = q_k.transpose(1,2)
pos_k = pos_k.transpose(1,2)
q_k = self.dropout(q_k)
pos_k = self.dropout(pos_k)
# print("Size after transpose: {}".format(q_k.shape))
        # In this step, we generate the semantic vector representation of the query. This
# is a standard neural network dense layer, i.e., y = tanh(W_s • v + b_s). Again,
# the paper does not include bias units.
q_k = self.sem_q_norm(q_k)
pos_k = self.sem_doc_norm(pos_k)
q_s = torch.tanh(self.query_sem(q_k))
pos_s = torch.tanh(self.doc_sem(pos_k))
q_s = self.dropout(q_s)
pos_s = self.dropout(pos_s)
# print("Semantic layer shape: {}".format(q_s.shape))
#q_s = q_s.resize(L)
#pos_s = pos_s.resize(L)
        # The original DSSM scores a query-document pair by the cosine similarity between
        # their semantic representations (see the commented-out lines below). Here we instead
        # concatenate the two semantic vectors and let a linear layer plus log-softmax produce
        # two output scores.
        # dots[0] would be the dot product for the positive document; this matters because
        # the target label is set accordingly.
        # dots = torch.mm(q_s, pos_s.transpose(0,1)).diag()
        # dots = dots / (torch.norm(q_s)*torch.norm(pos_s)) # divide by the norms to get cosine similarity
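        # A minimal alternative sketch (not used by this model): with q_s and pos_s of shape
        # (batch, 1, L), the original cosine scoring could be written as
        #   sims = F.cosine_similarity(q_s.squeeze(1), pos_s.squeeze(1), dim=1)
        # giving one similarity value per query-document pair.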
all_documents = torch.cat([q_s, pos_s], dim=2)
output = torch.tanh(self.concat_sem(all_documents))
output = self.dropout(output)
        # dots would be a Python list at this point; convert it to a tensor:
        #dots = torch.stack(dots)
        # In this step, we multiply each dot-product value by gamma. In the paper, gamma is
        # described as a smoothing factor for the softmax function, and it's set empirically
        # on a held-out data set. We're going to learn gamma's value by pretending it's
        # a single 1 x 1 convolution kernel.
        #dots = torch.stack([dots, dots],1)
        #dots = dots.unsqueeze(2)
        # Reshape the scalar scores into a 3-D tensor so the 1 x 1 convolution can consume them.
        # with_gamma = self.learn_gamma(dots)
output = self.softmax(output)
# print("Output shape: {}".format(output.shape))
        # Finally, the log-softmax above produces log-probabilities; exponentiating gives P(D+|Q).
#prob = F.logsigmoid(with_gamma)
#prob = F.softmax(with_gamma)
return output
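
# A minimal smoke-test sketch (not part of the original repository). The batch size and
# sequence lengths below are arbitrary assumptions; with the full WORD_DEPTH the layers
# are large, so uncommenting the test value WORD_DEPTH = 1000 above makes this much faster.
if __name__ == "__main__":
    model = CDSSM()
    model.eval()  # disable dropout and use batch-norm running statistics
    batch_size, query_len, doc_len = 8, 12, 60
    # forward() expects (batch, seq_len, WORD_DEPTH) inputs and transposes them itself.
    q = torch.randn(batch_size, query_len, WORD_DEPTH)
    pos = torch.randn(batch_size, doc_len, WORD_DEPTH)
    with torch.no_grad():
        out = model(q, pos)
    print(out.shape)  # expected: torch.Size([8, 1, 2]) -- log-probabilities for two classes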