-
Notifications
You must be signed in to change notification settings - Fork 45
/
classify.py
executable file
·96 lines (68 loc) · 2.57 KB
/
classify.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
from __future__ import division

import math

from db import Db
from mode import Mode
from words import text_to_list
class Classify(Mode):
    """Mode that estimates which of two doctypes a text belongs to.

    Each word of the text is given a probability based on the word counts
    stored in ``Db`` for the two doctypes, and the per-word probabilities are
    then combined into a single probability that the text is ``doctype1``
    rather than ``doctype2``.
    """

    # Words seen fewer than this many times in total (both doctypes together)
    # are considered too rare to carry information.
    MIN_WORD_COUNT = 5
    # Probability used for such rare words; 0.5 scales both products in
    # p_from_list equally and therefore does not move the combined result.
    RARE_WORD_PROB = 0.5
    # Probability used for a word seen only in doctype1; a word seen only in
    # doctype2 gets 1 - EXCLUSIVE_WORD_PROB.
    EXCLUSIVE_WORD_PROB = 0.99

    def set_text(self, text):
        """Convert ``text`` to a word list and store it as ``self.words``.

        Args:
            text: the raw text, passed unchanged to ``text_to_list``.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            ValueError: if ``text_to_list`` yields no words.
        """
        words = text_to_list(text)
        if not words:
            raise ValueError('Text did not contain any valid words')
        self.words = words
        return self

    def set_file_name(self, file_name):
        """Read ``file_name`` and load its contents through ``set_text``.

        Args:
            file_name: path of the file to classify.

        Returns:
            ``self`` (the return value of ``set_text``).

        Raises:
            ValueError: if the file cannot be opened or read, or if its
                contents contain no valid words.
        """
        # Only the file access is guarded: a "no valid words" error raised by
        # set_text must not be reported as a failure to read the file.
        try:
            # The context manager closes the handle even when read() fails.
            with open(file_name, 'r') as handle:
                file_contents = handle.read()
        except (IOError, OSError) as e:
            raise ValueError('Unable to read specified file "%s", the error message was: %s' % (file_name, e))
        return self.set_text(file_contents)

    def set_doctypes(self, doctype1, doctype2):
        """Store the two doctypes to compare after checking them against Db.

        Args:
            doctype1: doctype whose probability is reported.
            doctype2: doctype it is compared against.

        Raises:
            ValueError: if both doctypes are equal, or if either one is not a
                key of ``Db().get_doctype_counts()``.
        """
        if doctype1 == doctype2:
            raise ValueError('Please enter two different doctypes')
        known_doctypes = Db().get_doctype_counts()
        # doctype1 is checked first so that it is the one reported when both
        # are unknown.
        for doctype in (doctype1, doctype2):
            if doctype not in known_doctypes:
                raise ValueError('Unknown doctype: ' + doctype)
        self.doctype1 = doctype1
        self.doctype2 = doctype2

    def validate(self, args):
        """Check the argument list and load the file and doctypes from it.

        Args:
            args: sequence of exactly five items. ``args[0]`` is echoed in the
                usage message, ``args[2]`` is the file to classify and
                ``args[3]``/``args[4]`` are the two doctypes. ``args[1]`` is
                not read here (presumably the mode name -- confirm against
                the caller).

        Raises:
            ValueError: if the number of arguments is wrong, or if the file
                or the doctypes are rejected by the setters.
        """
        if len(args) != 5:
            raise ValueError('Usage: %s classify <file> <doctype> <doctype>' % args[0])
        self.set_file_name(args[2])
        self.set_doctypes(args[3], args[4])

    def p_for_word(self, db, word):
        """Return the probability that ``word`` indicates ``doctype1``.

        Reads ``self.doctype1_word_count`` and ``self.doctype2_word_count``,
        which ``execute`` sets before calling this method.

        Args:
            db: object providing ``get_word_count(doctype, word)``.
            word: the word to score.

        Returns:
            ``RARE_WORD_PROB`` when the word's combined count is below
            ``MIN_WORD_COUNT``; ``1 - EXCLUSIVE_WORD_PROB`` or
            ``EXCLUSIVE_WORD_PROB`` when the word occurs in only one of the
            doctypes; otherwise the ratio described in the comment below.
        """
        word_count_doctype1 = db.get_word_count(self.doctype1, word)
        word_count_doctype2 = db.get_word_count(self.doctype2, word)

        if word_count_doctype1 + word_count_doctype2 < self.MIN_WORD_COUNT:
            return self.RARE_WORD_PROB

        # A word seen in one doctype only would otherwise score exactly 0 or
        # 1 and dominate the combined result, so it is capped instead.
        if word_count_doctype1 == 0:
            return 1 - self.EXCLUSIVE_WORD_PROB
        elif word_count_doctype2 == 0:
            return self.EXCLUSIVE_WORD_PROB

        # P(S|W) = P(W|S) / ( P(W|S) + P(W|H) )
        p_ws = word_count_doctype1 / self.doctype1_word_count
        p_wh = word_count_doctype2 / self.doctype2_word_count
        return p_ws / (p_ws + p_wh)

    def p_from_list(self, l):
        """Combine per-word probabilities into one overall probability.

        Evaluates ``prod(p) / (prod(p) + prod(1 - p))`` over ``l``. The
        products are accumulated as a sum of log-odds instead of being
        multiplied directly: multiplying many values below 1 underflows to
        0.0 for long inputs, and the direct formula then divides zero by
        zero.

        Args:
            l: iterable of probabilities, each within [0, 1].

        Returns:
            A float within [0, 1]; 0.5 when ``l`` is empty.

        Raises:
            ValueError: if a value lies outside [0, 1].
            ZeroDivisionError: if ``l`` contains both 0 and 1, for which the
                formula is undefined.
        """
        log_odds = 0.0
        saw_zero = False
        saw_one = False
        for p in l:
            if not 0 <= p <= 1:
                raise ValueError('Probability out of range: %r' % (p,))
            # Exact 0 and 1 have infinite log-odds; track them separately.
            if p == 0:
                saw_zero = True
            elif p == 1:
                saw_one = True
            else:
                log_odds += math.log(p) - math.log1p(-p)

        if saw_zero and saw_one:
            raise ZeroDivisionError('Probabilities 0 and 1 cannot be combined')
        if saw_zero:
            return 0.0
        if saw_one:
            return 1.0

        # Logistic function, arranged so that exp() only ever receives a
        # non-positive argument and therefore cannot overflow.
        if log_odds >= 0:
            return 1.0 / (1.0 + math.exp(-log_odds))
        odds = math.exp(log_odds)
        return odds / (1.0 + odds)

    def execute(self):
        """Score every stored word and return the combined probability.

        Loads the doctype counts and per-doctype word totals from ``Db`` onto
        the instance, then combines the ``p_for_word`` value of each entry in
        ``self.words`` (duplicates included) with ``p_from_list``.

        Returns:
            The probability that the text is ``doctype1`` rather than
            ``doctype2``.
        """
        db = Db()
        doctype_counts = db.get_doctype_counts()
        self.doctype1_count = doctype_counts.get(self.doctype1)
        self.doctype2_count = doctype_counts.get(self.doctype2)

        self.doctype1_word_count = db.get_words_count(self.doctype1)
        self.doctype2_word_count = db.get_words_count(self.doctype2)

        probabilities = [self.p_for_word(db, word) for word in self.words]
        return self.p_from_list(probabilities)

    def output(self, result):
        """Print ``result`` as a two-decimal probability with both doctypes."""
        # A single parenthesised argument prints the same text whether print
        # is a statement or a function.
        print('Probability that document is %s rather than %s is %1.2f' % (self.doctype1, self.doctype2, result))