forked from yoconana/Information-Retrieval
-
Notifications
You must be signed in to change notification settings - Fork 0
/
QueryWithTF.py
62 lines (49 loc) · 1.45 KB
/
QueryWithTF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# Xi Chen
# 3/16/2016
# Lets the user input a query, then tokenizes it, removes stop words, and stems the remaining terms.
# Also outputs the TF list for the query.
from lxml import html
from lxml.html.clean import clean_html
import string
import os
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import os
import codecs # otherwise, it has ascii encoding error
import preprocess
import inverted_index
def outputStringQuery(query):
    """Tokenize a query, drop stop words, and stem what is left.

    The query is split with NLTK's ``word_tokenize``; tokens that exactly
    match (case-sensitively) an entry of NLTK's English stop-word list are
    removed; every remaining token is stemmed with ``PorterStemmer``.

    Args:
        query: The query text to process.

    Returns:
        A list of stemmed terms (each converted with ``str``), in the order
        they appear in the query.
    """
    stemmer = PorterStemmer()

    # Tokenize the query.
    tokens = word_tokenize(query)

    # Remove stop words.
    stop_words = set(stopwords.words("english"))
    kept_tokens = [tok for tok in tokens if tok not in stop_words]

    # Stem the surviving tokens.
    stemmed_words = []
    for tok in kept_tokens:
        try:
            stemmed_words.append(str(stemmer.stem(tok)))
        except UnicodeError:
            # Best effort: a token that cannot be converted is reported and
            # skipped. UnicodeError covers both the decode and the encode
            # failure, so neither one aborts the whole query.
            print(tok)

    # Return a list rather than a str. The join/split round trip is kept from
    # the original: it drops empty stems and splits any stem that contains
    # whitespace.
    return ' '.join(stemmed_words).split()
def extractValuePart(query):
    """Return the second element of every pair in *query*.

    Args:
        query: An iterable whose items each unpack into exactly two
            elements, e.g. a list of ``(key, value)`` tuples.

    Returns:
        A new list holding the second element of each item, in iteration
        order. An empty iterable yields an empty list.

    Raises:
        ValueError: If an item does not unpack into exactly two elements.
    """
    return [value for _, value in query]
"""
## the query terms after preprocess
query = raw_input('Enter your query:')
query = preprocess.cleanquery(query)
query = outputStringQuery(query)
## output a vectors list contain the tf values for query (0 or 1)
querylist = query  # outputStringQuery already returns a list of terms, so no split() is needed
queryTF = inverted_index.tf_of_query(querylist)
#print query
"""