-
Notifications
You must be signed in to change notification settings - Fork 2
/
ced_alignment.py
97 lines (67 loc) · 2.57 KB
/
ced_alignment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
from collections import deque
import editdistance
from utils import norm_pnx_nums
def _edit_distance(tokens1, tokens2, weight_fns):
tbl = {}
tbl[(0, 0)] = (0, 'n')
m = len(tokens1)
n = len(tokens2)
for i in range(0, m):
tbl[(i + 1, 0)] = (i + 1, 'd')
for j in range(0, n):
tbl[(0, j + 1)] = (j + 1, 'i')
if m == 0 or n == 0:
return tbl
for i in range(0, m):
for j in range(0, n):
if (tokens1[i] == tokens2[j]):
edit_cost = tbl[(i + 1, j + 1)] = (tbl[(i, j)][0], 'n')
else:
edit_cost = (tbl[(i, j)][0] + weight_fns['s'](tokens1[i], tokens2[j]), 's')
insert_cost = (tbl[(i, j + 1)][0] + weight_fns['d'](tokens1[i]), 'd')
delete_cost = (tbl[(i + 1, j)][0] + weight_fns['i'](tokens2[j]), 'i')
tbl[(i + 1, j + 1)] = min([insert_cost, delete_cost, edit_cost], key = lambda t: t[0])
return tbl
def _gen_alignments(tokens1, tokens2, orig_tokens1, orig_tokens2):
weight_fns = {
's': lambda x, y: editdistance.eval(x, y) * 2 / max(len(x), len (y)),
'd': lambda x: 1,
'i': lambda x: 1
}
dist_table = _edit_distance(tokens1, tokens2, weight_fns)
m = len(tokens1)
n = len(tokens2)
alignments = deque()
i = m
j = n
while i != 0 or j != 0:
op = dist_table[(i, j)][1]
cost = dist_table[(i, j)][0]
if op == 'n' or op == 's':
_op = 'KEEP' if op == 'n' else 'REPLACE'
orig_token1, orig_token2 = '', ''
if orig_tokens1[i - 1] != tokens1[i - 1]:
orig_token1 = orig_tokens1[i - 1]
if orig_tokens2[j - 1] != tokens2[j - 1]:
orig_token2 = orig_tokens2[j - 1]
alignments.appendleft((i, j, tokens1[i - 1], tokens2[j - 1], _op, orig_token1,
orig_token2))
i -= 1
j -= 1
elif op == 'i':
alignments.appendleft(('None', j, tokens2[j - 1], 'INSERT', '', orig_tokens2[j - 1]))
j -= 1
elif op == 'd':
alignments.appendleft((i, 'None', tokens1[i - 1], 'DELETE', orig_tokens1[i - 1], ''))
i -= 1
return alignments
def align_words(s1, s2):
norm_s1 = norm_pnx_nums(s1)
norm_s2 = norm_pnx_nums(s2)
s1_tokens = s1.split()
s2_tokens = s2.split()
norm_s1_tokens = norm_s1.split()
norm_s2_tokens = norm_s2.split()
alignments = _gen_alignments(norm_s1_tokens, norm_s2_tokens, s1_tokens,
s2_tokens)
return list(alignments)