-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathDependencyTool.py
120 lines (105 loc) · 3.35 KB
/
DependencyTool.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
from nltk.corpus import stopwords
import itertools
class DependencyTool:
    '''
    collects occurrence counts of dependencies and trains a simple
    trigram model

    every dependency is counted in up to three groups:
    'unfiltered'       - every dependency passed to put()
    'lossy-unfiltered' - every dependency, reduced to the tuple
                         (dependency[0][0], dependency[1], dependency[2][0])
    'filtered'         - only dependencies for which contains_stopwords()
                         returns False

    a dependency is read as dependency[0][0], dependency[1] and
    dependency[2][0]; it is used as a dict key and so must be hashable
    '''

    def __init__(self):
        # number of put() observations per group (duplicates included)
        self.total = {'unfiltered': 0, 'filtered': 0, 'lossy-unfiltered': 0}
        # per group: dependency -> number of occurrences
        self.counts = {'unfiltered': {}, 'filtered': {}, 'lossy-unfiltered': {}}
        # (first, second, third) -> number of times the trigram was trained
        self.trigram_counts = {}
        # (first, second) -> number of trained trigrams starting with the pair
        self.bigram_counts = {}
        # english stop words from the nltk corpus
        self.stop = stopwords.words('english')

    def _put(self, dependency, group_name):
        '''
        adds a dependency to a specific group

        increments both the group total and the per-dependency count
        raises KeyError if group_name is not a known group
        '''
        self.total[group_name] = self.total[group_name] + 1
        group = self.counts[group_name]
        group[dependency] = group.get(dependency, 0) + 1

    def put(self, dependency):
        '''
        adds a dependency

        the dependency is always counted in 'unfiltered' and, in its
        reduced form, in 'lossy-unfiltered'; it is counted in 'filtered'
        only when neither of its two words is a stop word
        '''
        self._put(dependency, 'unfiltered')
        # the lossy form keeps only the first element of both end items
        lossy = (dependency[0][0], dependency[1], dependency[2][0])
        self._put(lossy, 'lossy-unfiltered')
        if not self.contains_stopwords(dependency):
            self._put(dependency, 'filtered')

    def put_all(self, dependencies):
        '''
        adds an iterable object of dependencies
        '''
        for dependency in dependencies:
            self.put(dependency)

    def dependencies(self, group_name):
        '''
        returns a list of the distinct dependencies within a group

        raises KeyError if group_name is not a known group
        '''
        # list() keeps the documented return type on python 3 as well,
        # where dict.keys() is a view and not a list
        return list(self.counts[group_name])

    def count(self, dependency, group_name):
        '''
        returns the number of occurrences of a dependency within a group
        (0 if the dependency was never added to that group)
        '''
        return self.counts[group_name].get(dependency, 0)

    def train_trigram(self, first, second, third):
        '''
        adds a trigram to the training set for the trigram model
        format:
        each item = (morph_word, dependency_type)

        increments the count of the trigram and of its leading bigram
        (first, second)
        '''
        bi_key = (first, second)
        tri_key = (first, second, third)
        self.trigram_counts[tri_key] = self.trigram_counts.get(tri_key, 0) + 1
        self.bigram_counts[bi_key] = self.bigram_counts.get(bi_key, 0) + 1

    def trigram_probability(self, first, second, third):
        '''
        returns the probability of a trigram
        format:
        each item = (morph_word, dependency_type)

        computed as count(first, second, third) / count(first, second);
        returns 0.0 if the trigram or its leading bigram was never trained
        '''
        bi_key = (first, second)
        tri_key = (first, second, third)
        if bi_key not in self.bigram_counts or tri_key not in self.trigram_counts:
            return float(0)
        # float() forces true division on python 2 as well
        return self.trigram_counts[tri_key] / float(self.bigram_counts[bi_key])

    def frequency(self, dependency, group_name='filtered'):
        '''
        returns the frequency of a certain dependency within a given group
        additive smoothing: 1 is added to the dependency's count, so an
        unseen dependency yields 1 / total; the denominator is the plain
        group total and is not adjusted

        returns 0.0 for a group without any observations instead of
        dividing by zero
        raises KeyError if group_name is not a known group
        '''
        group = self.counts[group_name]
        total = float(self.total[group_name])
        if total == 0:
            # nothing was added to this group yet
            return 0.0
        return (group.get(dependency, 0) + 1) / total

    def contains_stopwords(self, dependency):
        '''
        returns boolean if the dependency contains stopwords

        only dependency[0][0] and dependency[2][0] are checked
        '''
        return dependency[0][0] in self.stop or dependency[2][0] in self.stop

    def dependency_frequencies(self, group_name):
        '''
        returns a map of all dependency frequencies within a given group
        (dependency -> frequency(dependency, group_name))
        '''
        freq = {}
        # iterate the dependencies of the requested group, not the group names
        for dependency in self.counts[group_name]:
            freq[dependency] = self.frequency(dependency, group_name)
        return freq