word_model2embeding_matrix.py
# -*- coding: utf-8 -*-
"""
Created on Sun Sep 22 13:33:52 2019
The outputs of this file feed the Embedding layer in Keras, e.g.
Embedding(nb_words, embedding_size, weights=[embedding_matrix], trainable=False)(inp)
@author: cjn
"""
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from gensim.models import KeyedVectors
import pickle
import pandas as pd
from data_preprocess import fetch_entites
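
# A minimal sketch of how the outputs are consumed downstream, following the
# Embedding call quoted in the module docstring (`inp` and `maxlen` are assumed
# to come from the surrounding Keras model definition, not from this file):
#   from keras.layers import Input, Embedding
#   inp = Input(shape=(maxlen,))
#   x = Embedding(embedMatrix.shape[0], embedMatrix.shape[1],
#                 weights=[embedMatrix], trainable=False)(inp)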


def build_word2idx_embedMatrix(model):
    '''Build a {'word': index} dict and an embedding matrix from a word-vector
    model trained with gensim.
    Note: indices in word2idx correspond to rows of embedMatrix. For example,
    if word2idx['国外'] is 991, then embedMatrix[991, :] is the vector for '国外'.
    Args:
        model: a trained gensim Word2Vec model
    Returns:
        word2idx: dict mapping word to index, e.g. {'国外': 991, '网': 992, '贷网': 993, ...}
        embedMatrix: one row per word, one column per embedding dimension
    '''
    # Array holding all word2vec vectors; note the one extra row of zeros at
    # index 0, reserved for padding.
    embedMatrix = np.zeros((len(model.wv.vocab.keys()) + 1, model.vector_size))
    # Build the word-to-index dict; index 0 is left free for padding (a stop-word
    # slot such as word2idx['_stop_word'] = 0 could also live there).
    word2idx = dict(zip(model.wv.vocab.keys(), range(1, len(model.wv.vocab.keys()) + 1)))
    # Fill the matrix: row i holds the vector of the word with index i.
    embedMatrix[1:, :] = model.wv[list(word2idx.keys())]
    return word2idx, embedMatrix
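
# Example usage (a minimal sketch; `w2v_model` is a hypothetical trained
# gensim 3.x Word2Vec model, not defined in this file):
#   word2idx, embedMatrix = build_word2idx_embedMatrix(w2v_model)
#   idx = word2idx['国外']
#   assert np.allclose(embedMatrix[idx], w2v_model.wv['国外'])
#   assert not embedMatrix[0].any()  # row 0 is the all-zero padding vector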


def build_word2idx_embedMatrix_2(model, entities_all):
    '''Build a {'word': index} dict and an embedding matrix, restricted to the
    words that also appear in entities_all.
    Note: indices in word2idx correspond to rows of embedMatrix. For example,
    if word2idx['国外'] is 991, then embedMatrix[991, :] is the vector for '国外'.
    Args:
        model: trained word vectors (a gensim KeyedVectors instance)
        entities_all: all words from the sentences and the annotated entities
    Returns:
        word2idx: dict mapping word to index, e.g. {'国外': 991, '网': 992, '贷网': 993, ...}
        embedMatrix: one row per word, one column per embedding dimension
    '''
    word_set = set(model.vocab.keys())
    word_interset = word_set.intersection(entities_all)
    # Array holding the selected word2vec vectors; note the one extra row of
    # zeros at index 0, reserved for padding.
    embedMatrix = np.zeros((len(word_interset) + 1, model.vector_size))
    # Build the word-to-index dict; index 0 is left free for padding.
    word2idx = dict(zip(word_interset, range(1, len(word_interset) + 1)))
    # Fill the matrix: row i holds the vector of the word with index i.
    embedMatrix[1:, :] = model[list(word2idx.keys())]
    return word2idx, embedMatrix


def make_deepLearn_data(sentenList, word2idx):
    '''Tokenize the input sentences, i.e. replace each word with its index.
    For example, the sentence ['如何', '安全', '提高', '最大化', '收益'] becomes
    [1, 2, 3, 4, 5] if those five words happen to map to those five indices.
    Args:
        sentenList: list of sentences, each itself a list of words,
            e.g. [['如何', '安全', '提高'], ['最大化', '收益']]
        word2idx: dict mapping word to index
    Returns:
        X_train_idx: the tokenized input, padded to the longest sentence
        maxlen: length of the longest sentence
    '''
    # Length of the longest sentence; everything is padded to this length.
    maxlen = max((len(sen) for sen in sentenList), default=0)
    # Words missing from word2idx map to 0, the padding index.
    X_train_idx = [[word2idx.get(w, 0) for w in sen] for sen in sentenList]
    X_train_idx = np.array(pad_sequences(X_train_idx, maxlen, padding='post'))  # must be np.array()
    return X_train_idx, maxlen
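
# Example (a minimal sketch with a toy vocabulary, not from the real data):
#   toy_word2idx = {'如何': 1, '安全': 2, '提高': 3}
#   X, maxlen = make_deepLearn_data([['如何', '安全', '提高'], ['安全']], toy_word2idx)
#   # X -> [[1, 2, 3], [2, 0, 0]]  (post-padding with 0), maxlen -> 3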


def make_deepLearn_data_entity(w, word2idx):
    '''Tokenize a single entity word (words missing from word2idx map to 0).'''
    return word2idx.get(w, 0)


def split_word(txt, sep=' '):
    '''Split a space-separated token string; txt[:-1] drops the trailing
    newline/separator character.'''
    if isinstance(txt, str):
        result = txt[:-1].split(sep)
    else:
        result = []
    return result


def split_word_entity(txt, sep=';'):
    '''Split a semicolon-separated entity string into a list of entities.'''
    if isinstance(txt, str):
        result = txt.split(sep)
    else:
        result = []
    return result


def generate_token(x, word2idx):
    '''Map each word in x to its index; unknown words map to 0 (padding).'''
    return [word2idx.get(item, 0) for item in x]


def get_all_entities():
    '''Collect all words from the segmented sentences together with the
    annotated entities from the two entity files.
    Returns:
        entities_all: set of all words and annotated entities
    '''
    with open('all_word_seg.txt', 'r', encoding='UTF-8') as f:
        sentences = [line[:-1].split(' ') for line in f.readlines()]
    entities_all = set()
    for sen in sentences:
        for item in sen:
            entities_all.add(item)
    with open('financial_entity_test.txt', 'r', encoding='UTF-8') as f:
        # each line carries a fixed 4-character tail that is stripped here
        entities = set(line[:-4].strip() for line in f.readlines())
    entities_all = entities_all.union(entities)
    with open('financial_entity.txt', 'r', encoding='UTF-8') as f:
        entities = set(line[:-4].strip() for line in f.readlines())
    entities_all = entities_all.union(entities)
    return entities_all


def entity_complement(train_file, test_file, output_file, embedMatrix, word2idx0):
    '''Complement the entities: append entities that the embedding matrix does
    not yet cover, then pickle the extended [word2idx, embedMatrix].
    Args:
        train_file: path to the training-set pickle
        test_file: path to the test-set pickle
        output_file: output path for the pickled [word2idx, embedMatrix]
        embedMatrix: embedding matrix produced by build_word2idx_embedMatrix_2
        word2idx0: token dict produced by build_word2idx_embedMatrix_2
    '''
    word2idx = word2idx0.copy()
    data_train = pd.read_pickle(train_file)
    data_test = pd.read_pickle(test_file)
    # Collect the entities that cannot be tokenized, from both train and test.
    entities = fetch_entites(data_train, ['entity', 'key_entity'], ';')
    entities = entities.union(fetch_entites(data_test, ['entity'], ';'))
    entity_miss_tmp = entities.difference(set(word2idx.keys()))
    entity_miss = set(entity.strip() for entity in entity_miss_tmp)
    entity_miss.discard('')
    # Append the un-tokenizable entities to the embedding matrix, numbering
    # them from the first free index.
    start_index = len(word2idx) + 1
    new_word2idx = dict(zip(entity_miss, range(start_index, len(entity_miss) + start_index)))
    # Update word2idx with the new entries.
    word2idx.update(new_word2idx)
    # Extend the embedding with zero rows for the new entities.
    new_embedding = np.zeros((len(new_word2idx.keys()), embedMatrix.shape[1]))
    embedMatrix = np.row_stack((embedMatrix, new_embedding))
    with open(output_file, 'wb') as f:
        pickle.dump([word2idx, embedMatrix], f)
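
# The pickled pair can be loaded back by a model-building script, e.g.:
#   with open('word2idx_embedMatrix.pkl', 'rb') as f:
#       word2idx, embedMatrix = pickle.load(f)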


def generate_training_data(data_train_file, output_file, word2idx):
    '''Tokenize the training data and pickle it.
    Args:
        data_train_file: training-set pickle file
        output_file: output path for the tokenized data
        word2idx: dict mapping word to index
    Returns:
        shape_dic: the padded lengths of the txt and title inputs
    '''
    data_train = pd.read_pickle(data_train_file)
    x_train_txt0 = data_train.txt_split.apply(split_word)
    X_train_txt, _ = make_deepLearn_data(x_train_txt0, word2idx)
    x_train_title0 = data_train.title_split.apply(split_word)
    X_train_title, _ = make_deepLearn_data(x_train_title0, word2idx)
    x_entity = data_train.entity.apply(split_word_entity).apply(generate_token, args=(word2idx,))
    y_train = data_train.negative.values
    train_data = dict(zip(['txt', 'title', 'entity', 'y_train'],
                          [X_train_txt, X_train_title, x_entity.values, y_train]))
    with open(output_file, 'wb') as f:
        pickle.dump(train_data, f)
    shape_dic = {'txt_shape': X_train_txt.shape[1],
                 'title_shape': X_train_title.shape[1]}
    return shape_dic
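
# The pickled training dict can be consumed later as, e.g. (a minimal sketch):
#   with open('train_data_model.pkl', 'rb') as f:
#       train_data = pickle.load(f)
#   X_txt, X_title, y = train_data['txt'], train_data['title'], train_data['y_train']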


def generate_test_data(data_test_file, output_file, shape_dic, word2idx):
    '''Tokenize the test data and pickle it.
    Args:
        data_test_file: test-set pickle file
        output_file: output path for the tokenized data
        shape_dic: txt and title shapes produced by generate_training_data
        word2idx: dict mapping word to index
    '''
    data_test = pd.read_pickle(data_test_file)
    x_test_txt0 = data_test.txt_split.apply(split_word)
    X_test_txt, _ = make_deepLearn_data(x_test_txt0, word2idx)
    x_test_title0 = data_test.title_split.apply(split_word)
    X_test_title, _ = make_deepLearn_data(x_test_title0, word2idx)
    x_entity = data_test.entity.apply(split_word_entity).apply(generate_token, args=(word2idx,))
    # Keep the test-set padding length consistent with the training set:
    # pad if shorter, truncate if longer.
    if shape_dic['txt_shape'] > X_test_txt.shape[1]:
        X_test_txt = pad_sequences(X_test_txt, shape_dic['txt_shape'], padding='post')
    else:
        X_test_txt = X_test_txt[:, :shape_dic['txt_shape']]
    if shape_dic['title_shape'] > X_test_title.shape[1]:
        X_test_title = pad_sequences(X_test_title, shape_dic['title_shape'], padding='post')
    else:
        X_test_title = X_test_title[:, :shape_dic['title_shape']]
    ## output file
    test_data = dict(zip(['txt', 'title', 'entity'],
                         [X_test_txt, X_test_title, x_entity.values]))
    with open(output_file, 'wb') as f:
        pickle.dump(test_data, f)


if __name__ == '__main__':
    #%% 1. Build the embedding matrix and the word-to-index dict.
    print('load word2vec model...')
    model = KeyedVectors.load_word2vec_format('train_vec_byTencent_word.bin', binary=True)
    print('build embedding matrix...')
    entities_all = get_all_entities()
    word2idx, embedMatrix = build_word2idx_embedMatrix_2(model, entities_all)
    #%% 2. Add rows for the missing entities and write out the embedding matrix.
    print('complement entities...')
    train_file = 'Train_Data.pkl'
    test_file = 'Test_Data.pkl'
    output_file = 'word2idx_embedMatrix.pkl'
    entity_complement(train_file, test_file, output_file, embedMatrix, word2idx)
    train_file = 'Train_Data_hastitle.pkl'
    test_file = 'Test_Data_hastitle.pkl'
    output_file = 'word2idx_embedMatrix_hastitle.pkl'
    entity_complement(train_file, test_file, output_file, embedMatrix, word2idx)
    #%% 3. Generate the training and test sets.
    ## no title
    print('produce training set...')
    data_train_file = 'Train_Data.pkl'
    output_file = 'train_data_model.pkl'
    shape_dic = generate_training_data(data_train_file, output_file, word2idx)
    print('produce test set...')
    data_test_file = 'Test_Data.pkl'
    output_file = 'test_data_model.pkl'
    generate_test_data(data_test_file, output_file, shape_dic, word2idx)
    ## with title
    print('produce training set (has title)...')
    data_train_file = 'Train_Data_hastitle.pkl'
    output_file = 'train_data_model_hastitle.pkl'
    shape_dic = generate_training_data(data_train_file, output_file, word2idx)
    print('produce test set (has title)...')
    data_test_file = 'Test_Data_hastitle.pkl'
    output_file = 'test_data_model_hastitle.pkl'
    generate_test_data(data_test_file, output_file, shape_dic, word2idx)