import ir_datasets
import os
import numpy as np
import pickle
import time
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
class Document:
    """ Represents a document across the collections """
    def __init__(self, id:int, text:str, title=''):
        self.id = id
        self.text = text
        self.title = title
        # its indexed terms
        self.terms_list = []
class Query:
    """ Represents a query across the collections """
    def __init__(self, id:int, query:str):
        self.id = id
        self.query = query
        # the relevant documents for this query
        self.docs_relevance = []
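# Note: Document.terms_list is filled later by Std_tokenizer.docs_process, and
# Query.docs_relevance is filled by each collection's load_queries from the dataset's qrels.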
class Std_tokenizer:
    """ This class contains the common behavior among the collections regarding document processing """
    stop_words = list(set(stopwords.words("english")))  # nltk stopwords
    lemmatizer = WordNetLemmatizer()  # nltk lemmatizer

    def docs_process(docs_list):
        """
        Receives a list of Document objects and tokenizes each one.
        Returns the list of Document objects, now with the indexed terms
        of every document, and a list with all the indexed terms.
        """
        start_time = time.time()
        # list of all indexed terms
        indexed_terms = []
        for i in range(len(docs_list)):
            doc_tokens = Std_tokenizer.tokenize_nltk(docs_list[i].text)
            docs_list[i].terms_list = doc_tokens  # assign the terms of document i
            for x in doc_tokens:
                # update the indexed terms list
                indexed_terms.append(x)
        unique_words = list(set(indexed_terms))  # remove repeated terms by using set()
        unique_words.sort()  # sort to have a unique order when indexing
        return docs_list, unique_words

    def tokenize_nltk(doc_text):
        text = doc_text.lower()
        # remove numbers
        text = text.translate(str.maketrans('', '', string.digits))
        # remove punctuation (replace it with ' ')
        text = text.translate(str.maketrans(Collection.punctuations(), ' ' * len(Collection.punctuations())))
        # generate tokens
        word_tokens = word_tokenize(text)
        # remove stopwords
        filtered_text = [word for word in word_tokens if word not in Std_tokenizer.stop_words]
        # lemmatize the words in the list of tokens
        lemmas = [Std_tokenizer.lemmatizer.lemmatize(word, pos='v') for word in filtered_text]
        return lemmas
    def structure_data(docs_list, terms):
        start_time = time.time()
        # take the documents and the indexed terms to create the inverted index and the matrix representation
        docs_by_term_dict = Std_tokenizer.dictionary_strctr(terms, docs_list, start_time)
        term_doc_matrix = Std_tokenizer.freq_m_strctr(terms, docs_list, docs_by_term_dict, start_time)
        return docs_by_term_dict, term_doc_matrix

    def dictionary_strctr(terms, docs_list, start_time):
        # a dictionary term: list of document ids. For every term in the collection, the documents it appears in
        docs_by_term_dict = {t: [] for t in terms}
        for i in range(len(docs_list)):
            no_repeat = set(docs_list[i].terms_list)
            # for every unique term in the document, add this document's id to dict[term]
            for t in no_repeat:
                docs_by_term_dict[t].append(docs_list[i].id)
        return docs_by_term_dict

    def freq_m_strctr(terms, docs_list, docs_by_term_dict, start_time):
        """ A more efficient approach than exploring the whole matrix by rows and columns """
        # np.zeros so that the positions that are never filled stay at zero
        term_doc_matrix = np.zeros((len(terms), len(docs_list)), dtype=int)
        # fill the non-zero positions of the frequency matrix
        for i in range(len(terms)):
            for k in range(len(docs_by_term_dict[terms[i]])):
                # only count the frequency of a term for the documents it appears in
                d_ind = docs_by_term_dict[terms[i]][k] - 1  # minus 1, because the document ids are 1-indexed
                freq = docs_list[d_ind].terms_list.count(terms[i])
                term_doc_matrix[i, d_ind] = freq
        return term_doc_matrix
    def pickle_dump(path, docs_list, docs_by_term_dict, terms, term_doc_matrix):
        # save the processed data to binary files, using pickle.
        # this allows us to read the files back later as python structures.
        # we are not saving the frequency matrix because of its large size (up to ~2 GB);
        # we rather save the lists and dictionaries and recreate the matrix in less than 10 seconds
        with open(os.path.join(path, 'dimensions.bin'), 'wb') as dimensions_f:
            pickle.dump(term_doc_matrix.shape, dimensions_f)
        with open(os.path.join(path, 'docs_by_term_dict.bin'), 'wb') as docs_by_term_f:
            pickle.dump(docs_by_term_dict, docs_by_term_f)
        with open(os.path.join(path, 'docs_list.bin'), 'wb') as docs_f:
            pickle.dump(docs_list, docs_f)
        with open(os.path.join(path, 'unique_terms.bin'), 'wb') as unique_terms_f:
            pickle.dump(terms, unique_terms_f)
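# Typical flow through Std_tokenizer (the concrete calls appear in each collection's process_docs below):
#   docs, terms = Std_tokenizer.docs_process(docs)                       # tokenize every Document
#   inv_index, freq_matrix = Std_tokenizer.structure_data(docs, terms)   # inverted index + term-document frequency matrix
#   Std_tokenizer.pickle_dump(path, docs, inv_index, terms, freq_matrix) # persist everything except the matrix itself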
class Collection:
    """ Abstract class Collection, with the common behavior of the collections """
    def punctuations(boolean=False):
        # in the boolean model we don't remove '&', '!' and '|' the way the other models do
        if boolean: return "\"#$%'()*+,-./:;<=>?@[]\\^_{`}~"
        return "!\"#$%&'()*+,-./:;<=>?@[]\\^_`{|}~"

    def load_files(self):
        # try to load the saved data
        path = self.save_path
        try:
            with open(os.path.join(path, 'docs_list.bin'), 'rb') as docs_f:
                self.documents_list = pickle.load(docs_f)
            with open(os.path.join(path, 'docs_by_term_dict.bin'), 'rb') as docs_by_term_f:
                self.terms_dict = pickle.load(docs_by_term_f)
            with open(os.path.join(path, 'unique_terms.bin'), 'rb') as unique_terms_f:
                self.indexed_terms = pickle.load(unique_terms_f)
            self.freq_matrix = Std_tokenizer.freq_m_strctr(self.indexed_terms, self.documents_list, self.terms_dict, time.time())
            self.loaded_metadata = True
        except Exception:
            self.loaded_metadata = False

    def docs_ranking(self, ranking, docs_id_list):
        # create a list with the information of the top ranked documents in docs_id_list
        result = []  # list of (id, title, text) tuples
        if not self.loaded_docs and not self.loaded_metadata:
            self.load_docs()
        docs_id_list = docs_id_list[:ranking]
        for id in docs_id_list:
            for doc in self.documents_list:
                if id == doc.id:
                    result.append((doc.id, doc.title, doc.text))
                    break
        return result[:ranking]
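# Each concrete collection below provides load_docs, load_queries and process_docs
# on top of the shared load_files / docs_ranking behavior inherited from Collection.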
class Cranfield(Collection):
    def __init__(self):
        self.loaded_docs = False
        self.loaded_queries = False
        self.loaded_rel = False
        self.loaded_metadata = False
        self.reduced = 1400  # amount of documents for the LSI
        self.numb_docs = 1400
        self.save_path = os.path.join(os.path.dirname(__file__), 'data', 'cranfield_treated')
        self._generator = ir_datasets.load('cranfield')

    def load_docs(self):
        self.documents_list = []
        i = 0
        for doc in self._generator.docs_iter():
            if i >= self.numb_docs:  # for reduced collections
                break
            self.documents_list.append(Document(int(doc.doc_id), doc.text, doc.title))
            i += 1
        self.loaded_docs = True

    def load_queries(self):
        # a dictionary of Query
        self.queries_dict = {}
        # load queries
        for query in self._generator.queries_iter():
            self.queries_dict[int(query.query_id)] = Query(int(query.query_id), query.text)
        self.loaded_queries = True
        # load the relevance judgments for the queries
        for qrel in self._generator.qrels_iter():
            if int(qrel.doc_id) <= self.numb_docs:
                q = self.queries_dict.get(int(qrel.query_id))
                if q:  # if the qrel.query_id is in self.queries_dict
                    q.docs_relevance.append(int(qrel.doc_id))
        self.loaded_rel = True

    def process_docs(self):
        # tokenize and lemmatize the documents
        if not self.loaded_docs:
            self.load_docs()
        self.documents_list, self.indexed_terms = Std_tokenizer.docs_process(self.documents_list)
        self.terms_dict, self.freq_matrix = Std_tokenizer.structure_data(self.documents_list, self.indexed_terms)
        # save to files
        Std_tokenizer.pickle_dump(self.save_path, self.documents_list, self.terms_dict, self.indexed_terms, self.freq_matrix)
class Vaswani(Collection):
    def __init__(self, reduce=False):
        self.loaded_docs = False
        self.loaded_queries = False
        self.loaded_rel = False
        self.loaded_metadata = False
        if reduce:  # reduce the dataset by taking fewer documents, shrinking the matrix dimensions
            self.numb_docs = 9000
            self.save_path = os.path.join(os.path.dirname(__file__), 'data', 'vaswani_r_treated')
        else:
            self.numb_docs = 11429
            self.save_path = os.path.join(os.path.dirname(__file__), 'data', 'vaswani_treated')
        self.reduced = 4000  # amount of documents for the LSI, because the matrix is still too big
        self._generator = ir_datasets.load('vaswani')

    def load_docs(self):
        self.documents_list = []
        i = 0
        for doc in self._generator.docs_iter():
            if i >= self.numb_docs:  # if reduce = True
                break
            self.documents_list.append(Document(int(doc.doc_id), doc.text, doc.text[:60] + '...'))
            i += 1
        self.loaded_docs = True

    def load_queries(self):
        # a dictionary of Query
        self.queries_dict = {}
        # load queries
        for query in self._generator.queries_iter():
            self.queries_dict[int(query.query_id)] = Query(int(query.query_id), query.text)
        self.loaded_queries = True
        # load the relevance judgments for the queries
        for qrel in self._generator.qrels_iter():
            if int(qrel.doc_id) <= self.numb_docs:
                q = self.queries_dict.get(int(qrel.query_id))
                if q:  # if the qrel.query_id is in self.queries_dict
                    q.docs_relevance.append(int(qrel.doc_id))
        self.loaded_rel = True

    def process_docs(self):
        # tokenize and lemmatize the documents
        if not self.loaded_docs:
            self.load_docs()
        self.documents_list, self.indexed_terms = Std_tokenizer.docs_process(self.documents_list)
        self.terms_dict, self.freq_matrix = Std_tokenizer.structure_data(self.documents_list, self.indexed_terms)
        # save to files
        Std_tokenizer.pickle_dump(self.save_path, self.documents_list, self.terms_dict, self.indexed_terms, self.freq_matrix)
class Nfcorpus(Collection):
    def __init__(self, reduce=False):
        self.loaded_docs = False
        self.loaded_queries = False
        self.loaded_rel = False
        self.loaded_metadata = False
        if reduce:  # reduce the dataset by taking fewer documents, shrinking the matrix dimensions
            self.numb_docs = 4000
            self.save_path = os.path.join(os.path.dirname(__file__), 'data', 'nfcorpus_r_treated')
        else:
            self.numb_docs = 5371
            self.save_path = os.path.join(os.path.dirname(__file__), 'data', 'nfcorpus_treated')
        self.reduced = 2500  # amount of documents for the LSI, because the matrix is still too big
        self._generator = ir_datasets.load('nfcorpus/dev')

    def load_docs(self):
        self.documents_list = []
        i = 0
        for doc in self._generator.docs_iter():
            if i >= self.numb_docs:
                break
            # replace non-ascii characters
            text_encode = doc.abstract.encode("ascii", "replace")
            text_decode = text_encode.decode()
            title_encode = doc.title.encode("ascii", "replace")
            title_decode = title_encode.decode()
            # keep only the numeric part of the document id
            self.documents_list.append(Document(int(doc.doc_id[4:]), text_decode, title_decode))
            i += 1
        self.loaded_docs = True

    def load_queries(self):
        # a dictionary of Query
        self.queries_dict = {}
        # load queries
        for query in self._generator.queries_iter():
            # remove unicode characters
            q_encode = (query.title.encode("ascii", "replace")).decode()
            self.queries_dict[int(query.query_id[6:])] = Query(int(query.query_id[6:]), q_encode)
        self.loaded_queries = True
        # load the relevance judgments for the queries
        for qrel in self._generator.qrels_iter():
            if int(qrel.doc_id[4:]) <= self.numb_docs:
                q = self.queries_dict.get(int(qrel.query_id[6:]))
                if q:  # if the qrel.query_id is in self.queries_dict
                    q.docs_relevance.append(int(qrel.doc_id[4:]))
        self.loaded_rel = True

    def process_docs(self):
        # tokenize and lemmatize the documents
        if not self.loaded_docs:
            self.load_docs()
        self.documents_list, self.indexed_terms = Std_tokenizer.docs_process(self.documents_list)
        self.terms_dict, self.freq_matrix = Std_tokenizer.structure_data(self.documents_list, self.indexed_terms)
        # save to files
        Std_tokenizer.pickle_dump(self.save_path, self.documents_list, self.terms_dict, self.indexed_terms, self.freq_matrix)
class Bool_node:
    # boolean model query node, positive
    def __init__(self, lex) -> None:
        self.lex = str(lex)

    def node_type(self):
        return 'basic'

    def node_value(self):
        return self.lex

    def __str__(self) -> str:
        return self.lex


class Bool_Not_node:
    # boolean model query node, negated
    def __init__(self, term):
        self.lex = str(term)

    def node_type(self):
        return 'not'

    def node_value(self):
        return self.lex

    def __str__(self) -> str:
        return self.lex
datasets = {
    'cranfield': Cranfield(),
    'cranfield_r': Cranfield(),
    'vaswani': Vaswani(),
    'vaswani_r': Vaswani(reduce=True),
    'nfcorpus': Nfcorpus(),
    'nfcorpus_r': Nfcorpus(reduce=True)
}
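# Illustrative usage sketch: it assumes the required nltk resources ('stopwords', 'punkt',
# 'wordnet') have already been downloaded and that the collection's save_path directory
# exists before process_docs tries to pickle its output.
if __name__ == '__main__':
    collection = datasets['cranfield']
    collection.load_files()  # try to reuse previously pickled structures
    if not collection.loaded_metadata:
        collection.process_docs()  # otherwise tokenize, index and pickle from scratch
    collection.load_queries()
    print(len(collection.documents_list), 'documents,',
          len(collection.indexed_terms), 'indexed terms,',
          'frequency matrix shape', collection.freq_matrix.shape)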