-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathseg.py
115 lines (108 loc) · 3.55 KB
/
seg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import pandas as pd
import re
import numpy as np
from keras.layers import Dense, Embedding, LSTM, TimeDistributed, Input, Bidirectional
from keras.models import Model
from keras.utils import np_utils
# Hyper-parameters for the character-level segmentation model.
word_size = 128   # character embedding dimension
maxlen = 32       # maximum fragment length in characters
batch_size = 1024

# Load the MSR training corpus, one tagged sentence per line.
# Use a context manager so the file handle is closed promptly
# (the original `open(...).read()` leaked the handle).
with open('seg-data/msr_train.txt') as f:
    s = f.read()
s = s.split("\n")
count = 0  # NOTE(review): appears unused in this file
def clean(s):
    """Normalize inconsistent quote tagging in one corpus line.

    Some lines carry an orphaned half of a quote pair tagged '/s';
    when the matching opener/closer is absent, drop the orphan so
    quote marks always come in pairs.
    """
    # (marker that must be present, orphan to strip when it is absent)
    quote_fixes = [
        ('“/s', ' ”/s'),
        ('”/s', '“/s '),
        ('‘/s', ' ’/s'),
        ('’/s', '‘/s '),
    ]
    for present, orphan in quote_fixes:
        if present not in s:
            return s.replace(orphan, '')
    return s
# Normalize quoting line-by-line, rejoin, then cut the corpus at tagged
# punctuation so each piece is a sentence-like fragment.
s = ''.join([clean(line) for line in s])
s = re.split('[,。!?、]/[bems]', s)

data = []   # character sequences, filled below
label = []  # parallel tag sequences
def get_xy(s):
    """Split a tagged fragment into (chars, tags).

    Each token looks like '<char>/<tag>'. Returns a pair of lists,
    or None when the fragment contains no tagged characters.
    """
    pairs = re.findall('(.)/(.)', s)
    if not pairs:
        return None
    arr = np.array(pairs)
    return list(arr[:, 0]), list(arr[:, 1])
# Build parallel lists of character sequences and tag sequences from the
# split corpus fragments.
for i in s:
    x = get_xy(i)
    if x:
        data.append(x[0])
        label.append(x[1])

# Collect into a DataFrame and keep only fragments that fit within maxlen.
d = pd.DataFrame(index=range(len(data)))
d['data'] = data
d['label'] = label
d = d[d['data'].apply(len) <= maxlen]
d.index = range(len(d))

# Tag -> integer class id; 'x' (4) is the padding class for short fragments.
tag = pd.Series({'s':0, 'b':1, 'm':2, 'e':3, 'x':4})

# Character vocabulary: count frequencies, then assign ids 1..N in
# descending-frequency order (0 is reserved for padding / unknown).
chars = []
for i in data:
    chars.extend(i)
chars = pd.Series(chars).value_counts()
print(type(chars))  # NOTE(review): debug leftover — consider removing
chars[:] = range(1, len(chars)+1)

# x: char-id sequence, zero-padded to maxlen.
d['x'] = d['data'].apply(lambda x: np.array(list(chars[x])+[0]*(maxlen-len(x))))
# y: one-hot (5-class) tag sequence, padded with the 'x' class [0,0,0,0,1].
d['y'] = d['label'].apply(lambda x: np.array(list(map(lambda y:np_utils.to_categorical(y,5), tag[x].reshape((-1,1))))+[np.array([[0,0,0,0,1]])]*(maxlen-len(x))))
# Build the tagger: embedding -> bidirectional LSTM (summed directions)
# -> per-timestep softmax over 5 tags (s/b/m/e + padding class).
# mask_zero=True makes downstream layers skip zero-padded positions.
sequence = Input(shape=(maxlen,), dtype='int32')
embedded = Embedding(len(chars)+1, word_size, input_length=maxlen, mask_zero=True)(sequence)
blstm = Bidirectional(LSTM(64, return_sequences=True), merge_mode='sum')(embedded)
output = TimeDistributed(Dense(5, activation='softmax'))(blstm)
# Keras 2 renamed/removed the old keyword arguments:
# Model(input=, output=) -> Model(inputs=, outputs=); fit(nb_epoch=) -> fit(epochs=).
model = Model(inputs=sequence, outputs=output)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(np.array(list(d['x'])),
                    np.array(list(d['y'])).reshape((-1, maxlen, 5)),
                    batch_size=batch_size, epochs=50)
# Tag-transition scores for Viterbi decoding. Only the legal transitions
# are listed; each gets a uniform probability of 0.5, stored as a
# log-probability so path scores can be summed.
zy = {
    'be': 0.5,
    'bm': 0.5,
    'eb': 0.5,
    'es': 0.5,
    'me': 0.5,
    'mm': 0.5,
    'sb': 0.5,
    'ss': 0.5,
}
zy = {pair: np.log(prob) for pair, prob in zy.items()}
def viterbi(nodes):
    """Decode the best tag path for a sequence of emission scores.

    nodes: list of dicts, one per character, mapping tag ('s'/'b'/'m'/'e')
    to its log-probability at that position.
    Returns the highest-scoring tag string (e.g. 'bessbe...').

    BUGFIX: the original indexed dict views (`paths.keys()[k]`,
    `nows.values()[k]`) and called np.argmax on dict.values(), all of
    which fail on Python 3; max(..., key=...) is equivalent and portable.
    """
    # A word can only start with 'b' (begin) or 's' (single).
    paths = {'b': nodes[0]['b'], 's': nodes[0]['s']}
    for pos in range(1, len(nodes)):
        prev_paths = paths
        paths = {}
        for cur_tag in nodes[pos]:
            # Extend every previous path whose last tag may legally
            # transition into cur_tag, scoring emission + transition.
            candidates = {
                path + cur_tag: score + nodes[pos][cur_tag] + zy[path[-1] + cur_tag]
                for path, score in prev_paths.items()
                if path[-1] + cur_tag in zy
            }
            # Keep only the best path ending in cur_tag (Viterbi pruning).
            best = max(candidates, key=candidates.get)
            paths[best] = candidates[best]
    return max(paths, key=paths.get)
def simple_cut(s):
    """Segment a punctuation-free chunk into words via the model + Viterbi.

    Maps characters to vocabulary ids (unknown -> 0), pads to maxlen,
    predicts per-character tag probabilities, Viterbi-decodes the tag
    string, then groups characters into words. Empty input -> [].
    """
    if not s:
        return []
    # Character ids, zero-padded to the model's fixed input length.
    ids = list(chars[list(s)].fillna(0).astype(int)) + [0] * (maxlen - len(s))
    probs = model.predict(np.array([ids]), verbose=False)[0][:len(s)]
    log_probs = np.log(probs)
    # Only the four real tags take part in decoding; padding class is dropped.
    nodes = [dict(zip(['s', 'b', 'm', 'e'], row[:4])) for row in log_probs]
    tags = viterbi(nodes)
    words = []
    for ch, tg in zip(s, tags):
        if tg in ('s', 'b'):
            words.append(ch)      # 's'/'b' open a new word
        else:
            words[-1] += ch       # 'm'/'e' extend the current word
    return words
# Spans that bypass the neural segmenter: runs of digits/Latin letters/spaces
# (kept whole) and single punctuation marks.
# BUGFIX: raw string — '\d' and '\.' in a plain string literal are invalid
# escape sequences (DeprecationWarning, SyntaxWarning in newer Pythons).
not_cuts = re.compile(r'([\da-zA-Z ]+)|[。,、?!\.\?,!]')
def cut_word(s):
    """Segment a full sentence into words.

    Punctuation and alphanumeric runs (matched by not_cuts) are emitted
    verbatim; the Chinese text between them is segmented by simple_cut.
    """
    pieces = []
    last_end = 0
    for match in not_cuts.finditer(s):
        # Neural-segment the Chinese span before this literal chunk.
        pieces.extend(simple_cut(s[last_end:match.start()]))
        pieces.append(match.group())
        last_end = match.end()
    # Trailing span after the last literal chunk.
    pieces.extend(simple_cut(s[last_end:]))
    return pieces