-
Notifications
You must be signed in to change notification settings - Fork 0
/
simple.py
executable file
·68 lines (56 loc) · 1.85 KB
/
simple.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/usr/bin/python
import argparse
from collections import defaultdict
import math
import os
import pickle
import re
import sys
from sklearn.metrics.pairwise import cosine_similarity
from itertools import tee, izip
def pairwise(iterable):
    """Yield overlapping pairs of consecutive items from *iterable*.

    s -> (s0, s1), (s1, s2), (s2, s3), ...

    Implemented as a plain generator so it needs neither ``itertools.tee``
    nor the Python-2-only ``itertools.izip`` and behaves the same on
    Python 2 and Python 3.

    Args:
        iterable: Any iterable; it is consumed lazily, one item at a time.

    Yields:
        2-tuples ``(previous, current)`` for every adjacent pair of items.
        Inputs with fewer than two items yield nothing.
    """
    iterator = iter(iterable)
    try:
        previous = next(iterator)
    except StopIteration:
        # Empty input: there is no first element to pair with anything.
        return
    for current in iterator:
        yield previous, current
        previous = current
def get_bigrams(filename):
    """Count the adjacent word pairs (bigrams) in a text file.

    The whole file is read and split on whitespace; every pair of
    neighbouring words is counted once per occurrence.

    Args:
        filename: Path of the file to read.

    Returns:
        A ``defaultdict(int)`` mapping ``(word, next_word)`` tuples to the
        number of times that pair occurs in the file.

    Raises:
        Whatever error ``open`` raises when the file cannot be opened for
        reading is propagated unchanged.
    """
    bigrams = defaultdict(int)
    # Read-only mode is sufficient (the former "r+" also demanded write
    # permission), and the context manager closes the handle, which was
    # previously left open.
    with open(filename, "r") as handle:
        words = handle.read().split()
    for first, second in pairwise(words):
        bigrams[(first, second)] += 1
    return bigrams
def get_features(dirs):
    """Build the bigram table for every file found under the given directories.

    Each directory in *dirs* is walked recursively and every file is passed
    to ``get_bigrams``.

    Args:
        dirs: Iterable of directory paths to scan.

    Returns:
        A ``defaultdict`` keyed by bare file name (not the full path) whose
        values are the per-file bigram counts. Because the key is only the
        file name, a later file with the same name replaces an earlier one.
        Unknown keys produce an empty ``defaultdict(int)``.
    """
    features = defaultdict(lambda: defaultdict(int))
    for directory in dirs:
        for parent, _subdirs, names in os.walk(directory):
            features.update(
                (name, get_bigrams(os.path.join(parent, name)))
                for name in names
            )
    return features
def calculate_bigram_similarity(a, b):
    """Return how many bigrams occur in both *a* and *b*.

    Args:
        a: Mapping of bigram -> occurrence count for the first document.
        b: Mapping of bigram -> occurrence count for the second document.

    Returns:
        The number of keys of *a* that are also keys of *b* and have a
        non-zero count in both mappings. The size of the counts does not
        influence the score.
    """
    # ``items()`` works for any mapping on Python 2 and 3 (``iteritems`` is
    # Python-2-only). The membership test comes before ``b[bigram]`` so a
    # defaultdict passed as *b* is never grown by the lookup.
    return sum(
        1
        for bigram, count in a.items()
        if bigram in b and count and b[bigram]
    )
def find_similar(test_bigrams, corpus):
    """Find the corpus entry whose bigrams overlap most with *test_bigrams*.

    Args:
        test_bigrams: Mapping of bigram -> count for the document under test.
        corpus: Mapping of name -> bigram-count mapping for each candidate.

    Returns:
        A ``(name, score)`` tuple for the best match, where the score comes
        from ``calculate_bigram_similarity``. If no candidate scores above
        zero (including an empty corpus) the result is ``("", 0)``.
    """
    # Dedicated names instead of ``max``, which shadowed the builtin.
    best_name = ""
    best_score = 0
    for name, candidate_bigrams in corpus.items():
        score = calculate_bigram_similarity(test_bigrams, candidate_bigrams)
        # Strictly greater: on a tie the first candidate encountered wins.
        if score > best_score:
            best_name, best_score = name, score
    return best_name, best_score
if __name__ == '__main__':
    # Command-line entry point: -d/--dir names corpus directories and
    # -t/--test names directories of files to match against that corpus.
    # Both options may be repeated; each use appends one more directory.
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--dir", type=str, action="append", default=[])
    parser.add_argument("-t", "--test", type=str, action="append", default=[])
    args = parser.parse_args()

    corpus = get_features(args.dir)
    test_features = get_features(args.test)

    # Report the closest corpus file for every test file. A single
    # parenthesised argument to print produces the same output on
    # Python 2 and Python 3.
    for test, bigrams in test_features.items():
        name, value = find_similar(bigrams, corpus)
        print("Most similar for %s (%s bigrams): %s (%s)"
              % (test, len(bigrams), name, value))