forked from mit09m/Bolean-query-retrieval-system
-
Notifications
You must be signed in to change notification settings - Fork 0
/
boolean query.py
118 lines (108 loc) · 3.34 KB
/
boolean query.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#Processes Boolean query as argument
#14095004 Aman Soni
#14095062 Shah Mit Paragbhai
#14095090 Shashwat Sinha
class MyStack:
    """Minimal LIFO stack backed by a Python list.

    The newest item lives at the end of ``self.container``.
    """

    def __init__(self):
        # Items in insertion order; the last element is the top of the stack.
        self.container = []

    def isEmpty(self):
        """Return True when the stack holds no items."""
        # The list must be tested directly; calling it (``self.container()``)
        # raises TypeError because a list is not callable.
        return not self.container

    def push(self, item):
        """Place ``item`` on top of the stack."""
        self.container.append(item)

    def pop(self):
        """Remove and return the top item.

        Raises IndexError if the stack is empty.
        """
        return self.container.pop()

    def top(self):
        """Return the top item without removing it.

        Raises IndexError if the stack is empty.
        """
        return self.container[-1]
import os, sys
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
def list_files(dir):
    """Return the paths of all files found under ``dir``, recursively.

    Each path is the containing directory (as reported by ``os.walk``)
    joined to the file name with a literal "/". A directory that does not
    exist or contains no files yields an empty list.

    The parameter is named ``dir`` (shadowing the builtin) to keep the
    existing call signature.
    """
    r = []
    # A single walk already supplies every directory's file list, so there is
    # no need to walk each sub-directory again; this also avoids the
    # Python-2-only generator method ``.next()``.
    for subdir, _dirnames, files in os.walk(dir):
        for file in files:
            r.append(subdir + "/" + file)
    return r
# Build an in-memory inverted index over every file found under `pth`.
# `dic` maps each whitespace-delimited token to a list of
# (occurrences_in_file, file_name) tuples, one tuple per file containing it.
#
# NOTE(review): tokens are indexed verbatim (no stemming or case folding),
# whereas the query code further down this file looks terms up by
# ps.stem(...). Confirm whether index keys should be stemmed as well.
ps = PorterStemmer()
pth = '/home/mit/Downloads/Infomation Retrieval/Assignment 2 indexing/20news-18828/alt.atheism'
fls = list_files(pth)
cnt = 0  # number of files processed so far
dic = {}
for fl in fls:
    cnt += 1
    # Term frequencies for the current file only.
    tmp_dic = {}
    # `with` guarantees the handle is closed even if reading raises.
    with open(fl, 'r') as f:
        for line in f:
            for wrd in line.split():
                tmp_dic[wrd] = tmp_dic.get(wrd, 0) + 1
    # Postings are labelled with the bare file name, not the full path.
    cur_file_name = fl.split('/')[-1]
    for k in tmp_dic:
        dic.setdefault(k, []).append((tmp_dic[k], cur_file_name))
    # Progress indicator every 1000 files. The parenthesised form prints the
    # same text as a Python 2 statement and as a Python 3 function call.
    if cnt % 1000 == 0:
        print(cnt)
# Evaluate the Boolean query passed on the command line.
#
# Every query token is a separate argument: "(", ")", AND, OR, NOT, or a
# search term. Terms are stemmed and replaced by the list of file names from
# their postings. When a ")" is reached, all operands back to the matching "("
# are combined with the operator most recently pushed, and the result takes
# their place on the operand stack. The operand stack is printed at the end.
ops = MyStack()  # operators, in the order they were read
d = dic
s = MyStack()    # operands: "(" markers and lists of file names
# Not referenced anywhere below; kept so the module's global names are unchanged.
list_1 = []
list_2 = []
list_u = []
list_d = []
p1 = []
p2 = []
# sys.argv[0] is the script name, so only the remaining arguments are tokens.
for token in sys.argv[1:]:
    if token == '(':
        s.push('(')
    elif token == ')':
        # Pop operands back to the matching "("; temp[0] is therefore the
        # right-most operand of the group.
        temp = []
        while s.top() != '(':
            temp.append(s.pop())
        s.pop()  # discard the "(" marker itself
        # A single parenthesised term needs no operator, so do not touch
        # `ops` (it may legitimately be empty at this point).
        temp_op = ops.top() if len(temp) > 1 else None
        # n operands consume n - 1 operators; only the top one is applied to
        # the whole group.
        for _ in range(len(temp) - 1):
            ops.pop()
        temp_res = temp[0]
        for t in temp[1:]:
            if temp_op == 'AND':
                temp_res = list(set(temp_res) & set(t))
            elif temp_op == 'OR':
                temp_res = list(set(temp_res) | set(t))
            elif temp_op == 'NOT':
                # Binary NOT: "a NOT b" keeps the files of a that are not in b.
                temp_res = list(set(t) - set(temp_res))
        s.push(temp_res)
    elif token == 'AND':
        ops.push('AND')
    elif token == 'OR':
        ops.push('OR')
    elif token == 'NOT':
        ops.push('NOT')
    else:
        tmp_str = ps.stem(token)
        # Postings are (count, file_name) tuples; keep only the file names.
        # A term missing from the index contributes an empty result instead
        # of aborting the whole query with KeyError.
        s.push([name for _count, name in d.get(tmp_str, [])])
print(s.container)