-
Notifications
You must be signed in to change notification settings - Fork 28
/
Copy pathTFIDF.py
38 lines (34 loc) · 932 Bytes
/
TFIDF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import math
import nltk as nltk
__author__ = 'YYT'
def ComputeFreq(wordlist, text):
    """Compute the relative frequency of each candidate word in ``text``.

    For every entry of ``wordlist``, the number of occurrences reported by
    ``text.count(word)`` (a substring count, so multi-word phrases work) is
    divided by the number of tokens ``nltk.word_tokenize`` produces for
    ``text``.

    Args:
        wordlist: Iterable of words (or phrases) to score.
        text: The document, as a single string.

    Returns:
        A list of ``{'word': ..., 'freq': ...}`` dicts, one per entry of
        ``wordlist`` and in the same order. ``freq`` is always a float; it
        is 0.0 when ``text`` yields no tokens.
    """
    words = list(wordlist)
    if not words:
        # Nothing to score, so skip tokenizing altogether.
        return []

    # The token count does not depend on the word being scored, so it is
    # computed once instead of once per word.
    length = len(nltk.word_tokenize(text))

    result = []
    for word in words:
        countword = text.count(word)
        # float() forces true division even on Python 2, where int / int
        # would truncate nearly every frequency to 0. A text without tokens
        # gets frequency 0.0 rather than a ZeroDivisionError.
        freq = float(countword) / length if length else 0.0
        result.append({'word': word, 'freq': freq})
    return result
def Computetfidf(wordfreq, corpus):
    """Score each word by TF-IDF and return the scores, highest first.

    The inverse document frequency is ``log10(N / (1 + df))`` where ``N`` is
    ``len(corpus)`` and ``df`` is the number of corpus entries that contain
    the word as a substring. The ``1 +`` keeps the denominator non-zero for
    words that appear in no entry.

    Args:
        wordfreq: Iterable of dicts with a ``'word'`` key and a ``'freq'``
            key (the term frequency), e.g. the output of ``ComputeFreq``.
        corpus: Sized collection of document strings.

    Returns:
        A list of ``{'word': ..., 'tfidf': ...}`` dicts sorted by descending
        ``tfidf``. Entries with equal scores keep their ``wordfreq`` order.

    Raises:
        ValueError: Raised by ``math.log10`` when ``corpus`` is empty while
            ``wordfreq`` is not, because the ratio is then 0.
    """
    # The corpus size is the same for every word; compute it once.
    dlength = len(corpus)

    result = []
    for item in wordfreq:
        word = item['word']
        tf = item['freq']
        # Start at 1 so a word found in no document does not divide by zero.
        count = 1 + sum(1 for line in corpus if word in line)
        # float() forces true division on Python 2 as well; integer division
        # would truncate the ratio (possibly to 0, making log10 fail).
        idf = math.log10(float(dlength) / count)
        result.append({'word': word, 'tfidf': tf * idf})

    # key/reverse works on both Python 2 and 3 (the cmp-function form was
    # removed in Python 3) and, like the original, keeps ties stable.
    result.sort(key=lambda entry: entry['tfidf'], reverse=True)
    return result