-
Notifications
You must be signed in to change notification settings - Fork 0
/
build_word_token_dict.py
48 lines (41 loc) · 1.09 KB
/
build_word_token_dict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import cPickle
import cStringIO
import sys
NCOUNTLIMIT = 5 #prune words with fewer than NCOUNTLIMIT occurrences
NMAXSTRINGLEN = 17 #keep only words shorter than NMAXSTRINGLEN characters; longer ones are pruned
def create_token_dict(f, min_count=None, max_len=None):
    """
    Count how often each word occurs in an iterable of lines.

    Input: f -- iterable yielding one word per line (e.g. an open file);
           surrounding whitespace is stripped from every line, so blank
           lines are counted as the empty-string token.
           min_count -- words seen fewer than this many times are dropped;
           defaults to the module-level NCOUNTLIMIT when None.
           max_len -- only words strictly shorter than this are counted;
           defaults to the module-level NMAXSTRINGLEN when None.
    Output: dictionary mapping words to the number of occurrences
    Note: easy way to make this faster is to take a sample of input words,
          saves memory too
    """
    # Resolve defaults at call time so the module constants stay the single
    # source of truth for callers that pass nothing.
    if min_count is None:
        min_count = NCOUNTLIMIT
    if max_len is None:
        max_len = NMAXSTRINGLEN

    # Count every distinct word that passes the length filter.
    counts = {}
    for line in f:
        token = line.strip()
        if len(token) < max_len:
            counts[token] = counts.get(token, 0) + 1

    # Build the pruned result as a new dict rather than deleting entries
    # while iterating, which raises RuntimeError on Python 3 dict views.
    return {token: n for token, n in counts.items() if n >= min_count}
def usage():
    """
    Print a one-line example of how to invoke this script.

    The example pipes a one-word-per-line file into the script and passes
    the output pickle path as the single argument.
    """
    # The %s placeholder stands for the name this script was invoked with.
    print("cat one_word_per_line | %s token_counts.pickle" % sys.argv[0])
def parse_cp():
    """
    Read the output file name from the command line.

    Returns the single positional argument. If the number of arguments is
    anything other than one, prints the usage line and exits with status -1.
    """
    positional = sys.argv[1:]
    if len(positional) == 1:
        return positional[0]
    usage()
    sys.exit(-1)
def main():
    """
    Script entry point: count words from stdin and pickle the result.

    Reads one word per line from standard input, builds the pruned
    word-count dictionary, and writes it with the highest pickle protocol
    to the file named by the single command-line argument.
    """
    outfn = parse_cp()
    d = create_token_dict(sys.stdin)
    # HIGHEST_PROTOCOL is a binary format, so the file must be opened in
    # binary mode. The with-block also closes the file if dump() raises.
    with open(outfn, "wb") as outf:
        cPickle.dump(d, outf, protocol=cPickle.HIGHEST_PROTOCOL)
    print("wrote token dict to file %s" % outfn)
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()