-
Notifications
You must be signed in to change notification settings - Fork 0
/
tfidf.py
138 lines (119 loc) · 4.17 KB
/
tfidf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
from collections import defaultdict
from multiprocessing import Pool
from bs4 import BeautifulSoup
import re
import sys
import os
import datetime
import math
import time
# Logging is disabled since it is not available
def extract_article_body(filename):
    """
    Extract the article body text from the SEP article stored at `filename`.

    The file is parsed as HTML, the bibliography section (the last
    "Bibliography" <h2> heading together with every sibling element that
    follows it) is removed, and the text of the <div id="aueditable"> element
    is returned with any remaining "&name;" sequences stripped out.

    Returns the empty string when the file is not valid UTF-8 or when the
    document contains no <div id="aueditable"> element, so callers always get
    a string back. Errors raised while opening the file are not caught.
    """
    print(filename)
    # Decode explicitly as UTF-8 so the outcome does not depend on the
    # platform's default encoding; files that are not valid UTF-8 are skipped.
    try:
        with open(filename, encoding="utf-8") as f:
            doc = f.read()
    except UnicodeDecodeError:
        return ''

    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so results could differ
    # from machine to machine.
    soup = BeautifulSoup(doc, "html.parser")

    # rip out bibliography
    headings = soup.find_all('h2', string='Bibliography')
    if headings:
        biblio_root = headings[-1]
        # BeautifulSoup 4 returns the matching <h2> tags themselves when a tag
        # name is combined with a string filter. Looking for an enclosing <h2>
        # parent of that tag finds nothing, which meant the bibliography was
        # never removed. Only fall back to the parent lookup when the match is
        # not already the heading.
        if getattr(biblio_root, 'name', None) != 'h2':
            biblio_root = biblio_root.find_parent('h2')
        if biblio_root is not None:
            # Collect the siblings before detaching anything from the tree.
            biblio = [biblio_root]
            biblio.extend(biblio_root.find_next_siblings())
            for elm in biblio:
                elm.extract()

    # grab modified body
    body = soup.find("div", id="aueditable")
    if body is None:
        return ''
    # remove HTML escaped characters
    return re.sub(r"&\w+;", "", body.text)
def tf(string):
    """
    Compute the term frequency of every whitespace-separated word in `string`.

    Returns a defaultdict(int) mapping each word to the number of times it
    occurs divided by the total number of words. An empty (or all-whitespace)
    string yields an empty mapping.
    """
    tokens = string.split()
    token_total = len(tokens)
    frequencies = defaultdict(int)

    # First pass: raw occurrence counts.
    for token in tokens:
        frequencies[token] = frequencies[token] + 1

    # Second pass: turn each count into a fraction of the article's words.
    # Only values are replaced, so iterating the keys while assigning is safe.
    for token in frequencies:
        frequencies[token] = frequencies[token] / token_total

    return frequencies
def idf(tfDictList):
    """
    Compute the inverse document frequency (idf) of every word that appears
    in any of the given term-frequency dictionaries.

    `tfDictList` is a list with one word -> tf mapping per article. The
    result is a defaultdict mapping each word to
    log10(number of articles / number of articles containing the word).
    """
    article_total = len(tfDictList)
    document_frequency = defaultdict(int)

    # Count, for each word, how many articles contain it.
    for term_frequencies in tfDictList:
        for word in term_frequencies:
            document_frequency[word] += 1

    # Replace each count with its idf value in place.
    for word in document_frequency:
        containing = document_frequency[word]
        document_frequency[word] = math.log10(article_total / containing)

    return document_frequency
def tfidf(idfDict, tfDictList):
    """
    Combine term frequencies with inverse document frequencies.

    `idfDict` is the word -> idf mapping produced by idf and `tfDictList` is
    the list of per-article word -> tf mappings. Returns a new list holding
    one new defaultdict per article, mapping each word of that article to
    tf * idf. The input tf dictionaries are left unmodified.

    Every word in the tf dictionaries is looked up in `idfDict` with plain
    indexing, so a non-default dict that lacks a word raises KeyError.
    """
    tfidfList = []
    for article in tfDictList:
        # Build a fresh mapping instead of overwriting the caller's tf values,
        # which would silently destroy the term frequencies.
        weighted = defaultdict(int)
        for key, value in article.items():
            weighted[key] = idfDict[key] * value
        tfidfList.append(weighted)
    return tfidfList
def tfidfWordSum(dictlist):
    """
    Add up the tfidf value of each word across all articles.

    `dictlist` is the list of per-article word -> tfidf mappings produced by
    tfidf. Returns a single defaultdict mapping every word to the sum of its
    tfidf values over all articles.
    """
    totals = defaultdict(int)
    for article_scores in dictlist:
        for word in article_scores:
            totals[word] = totals[word] + article_scores[word]
    return totals
if __name__ == '__main__':
    # Script entry point: walk the directory given as the last command-line
    # argument, extract every "index.html" article found beneath it, compute
    # the summed tfidf score of each word over all articles, and append a
    # timestamped report (highest score first) to tfidf_output.txt.

    # Without an argument sys.argv[-1] is the script itself, which would
    # silently produce an empty report; fail with a usage message instead.
    if len(sys.argv) < 2:
        sys.exit("usage: tfidf.py ENTRIES_DIR")

    # time.clock() was removed in Python 3.8. perf_counter() measures elapsed
    # wall-clock time, which is what a run dominated by file reading needs.
    startTime = time.perf_counter()
    entriesDir = sys.argv[-1]

    articles = []
    for path, _dirs, files in os.walk(entriesDir):
        for f in files:
            if f == "index.html":
                filePath = os.path.join(path, f)
                articles.append(extract_article_body(filePath))

    tfResults = [tf(article) for article in articles]
    idfResults = idf(tfResults)
    tfidfList = tfidf(idfResults, tfResults)
    tfidfSummed = tfidfWordSum(tfidfList)

    # Timing (covers extraction and scoring, not the report writing below)
    timestamp = str(datetime.datetime.now()) + "\n"
    endTime = time.perf_counter()
    elapsedTime = str(endTime - startTime) + " seconds \n"

    # Append so earlier runs are kept; write UTF-8 so words with non-ASCII
    # characters do not fail on platforms with a narrower default encoding.
    with open('tfidf_output.txt', 'a+', encoding='utf-8') as f:
        f.write(timestamp + elapsedTime)
        f.write("---------------------------\n\n")
        ranked = sorted(tfidfSummed.items(), key=lambda x: x[1], reverse=True)
        for key, value in ranked:
            f.write(key + " , " + str(value) + "\n")