# @File : data_helper.py
# @Author: Cuiqingyao
# @Date : 18-11-29
# @Desc : Data preprocessing for the word segmentation experiments.
# @Contact: [email protected]
import math
import os
import re

from util import *        # I/O helpers (read/write json and txt)
from settings import *    # corpus paths and split ratios
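
# NOTE: `util` and `settings` are local modules not shown here. From the
# calls below, `util` is assumed to provide the I/O helpers read_from_json,
# write_to_json, read_from_txt, write_to_txt and write_sentences, and
# `settings` is assumed to define the path constants (ORIGIN_CORPUS,
# WORD_DICT_FILE, TRAINING_FILE, TEST_FILE, ANSWERS,
# TRAINING_FILE_FOR_BIGRAM, WORD_DICT_FILE_FOR_BIGRAM) and the split
# ratios TRAINING and TEST.
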
class DataHelper(object):
    '''
    Data preprocessing: clean the raw corpus, split it into training and
    test data, and build the word dictionaries.
    '''

    def __init__(self, origin_corpus=ORIGIN_CORPUS, word_dict_file=WORD_DICT_FILE):
        # Raw corpus
        self.origin_corpus = origin_corpus
        # Word dictionary file
        self.word_dict_file = word_dict_file

    def __str__(self):
        return 'datahelper -- origin_corpus: {0}'.format(self.origin_corpus)
    def __build_dict(self):
        '''
        Build the word-count dictionaries.
        :return:
        '''
        try:
            if os.path.exists(TRAINING_FILE):
                word_dict = {}
                training_data = self.__load(type='txt', data_file=TRAINING_FILE)
                for data in training_data:
                    for word in data:
                        self.__update_count(d=word_dict, key=word)
                self.__save(data=word_dict, type='json', data_file=self.word_dict_file)
                # Report corpus statistics: maximum word length,
                # number of word types, and number of tokens.
                self.__max_word_length(word_dict=word_dict)
                self.__number_of_word_types(word_dict=word_dict)
                self.__number_of_tokens(training_data=training_data)
            # Build the dictionary over the full corpus (for the bigram model)
            if os.path.exists(TRAINING_FILE_FOR_BIGRAM):
                word_dict_bigram = {}
                bi_gram_data = self.__load(type='txt', data_file=TRAINING_FILE_FOR_BIGRAM)
                for data in bi_gram_data:
                    for word in data:
                        self.__update_count(d=word_dict_bigram, key=word)
                self.__save(data=word_dict_bigram, type='json', data_file=WORD_DICT_FILE_FOR_BIGRAM)
        except Exception as e:
            print(e)
            print("Exiting...")
            exit()
    def __save(self, data, type, data_file):
        '''
        Save data to a file.
        :param data:
        :param type: data format ('json' or 'txt')
        :param data_file:
        :return:
        '''
        if type == 'json':
            write_to_json(json_data=data, data_file=data_file)
        elif type == 'txt':
            write_to_txt(txt_data=data, data_file=data_file)
        else:
            raise Exception("Unsupported format %s!" % (type,))
    def __load(self, type, data_file):
        '''
        Load data from a file.
        :param type: data format ('json' or 'txt')
        :param data_file:
        :return:
        '''
        if type == 'json':
            return read_from_json(data_file=data_file)
        elif type == 'txt':
            return read_from_txt(data_file=data_file)
        else:
            raise Exception("Unsupported format %s!" % (type,))
    def __clean_corpus(self):
        '''
        Clean the corpus: drop empty lines and the leading date field of
        each remaining line.
        :return: the cleaned corpus, one token list per sentence
        '''
        clean_data = []
        if os.path.exists(self.origin_corpus):
            with open(self.origin_corpus, 'r', encoding='utf8') as f_in:
                for line in f_in:
                    # For non-empty lines, drop the leading date field and
                    # strip the POS tag from each 'token/POS' pair.
                    if len(line.split()) != 0:
                        data = []
                        pairs = re.split(r'\s+', line.strip())
                        for pair in pairs[1:]:
                            # Split on the last '/' so tokens that themselves
                            # contain '/' survive intact.
                            token, _ = pair.rsplit('/', 1)
                            data.append(token)
                        clean_data.append(data)
        else:
            raise Exception('Raw corpus does not exist!')
        return clean_data
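
    # A hypothetical example of what __clean_corpus does to one line (the
    # format is inferred from the parsing code above: a date/ID field
    # followed by whitespace-separated 'token/POS' pairs):
    #
    #   19980101-01-001-001/m  迈向/v  充满/v  希望/n
    #
    # pairs[0] is the date/ID field, skipped by pairs[1:]; stripping the
    # POS tags from the rest yields ['迈向', '充满', '希望'].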
    def __update_count(self, d, key):
        '''
        Increment the count stored under ``key`` by 1.
        :param d:
        :param key:
        :return:
        '''
        v = d.setdefault(key, 0)
        d[key] = v + 1
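
    # Design note: __update_count hand-rolls what collections.Counter in the
    # standard library already provides; the same mapping could be built in
    # one pass, e.g.:
    #
    #   from collections import Counter
    #   word_dict = Counter(word for data in training_data for word in data)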
    def processed(self):
        '''
        Public entry point for data processing.
        :return:
        '''
        if os.path.exists(TRAINING_FILE) and \
           os.path.exists(TEST_FILE) and \
           os.path.exists(WORD_DICT_FILE) and \
           os.path.exists(TRAINING_FILE_FOR_BIGRAM):
            print('Data already processed...')
        else:
            print('Preprocessing data ...')
            print('Generating training and test data ...')
            print('training data size : test data size = %.1f : %.1f' % (TRAINING, TEST))
            # Generate training data, test data, and segmentation answers
            self.__generate_train_test_data()
            # Build the dictionary from the training data
            self.__build_dict()
            print("Data generation complete ...")
    def __max_word_length(self, word_dict):
        '''
        Compute and print the maximum word length.
        :return:
        '''
        max_len = 0
        for word in word_dict:
            if max_len < len(word):
                max_len = len(word)
        print('max word length :', max_len)
    def __number_of_word_types(self, word_dict):
        '''
        Print the number of word types.
        :return:
        '''
        print('number of word types :', len(word_dict))
    def __number_of_tokens(self, training_data):
        '''
        Print the number of tokens.
        :param training_data:
        :return:
        '''
        tokens_num = 0
        for data in training_data:
            tokens_num += len(data)
        print('number of tokens :', tokens_num)
    def __generate_train_test_data(self):
        '''
        Generate the training data, test data, and gold segmentation answers.
        :return:
        '''
        assert TRAINING < 1.0 and TEST < 1.0 and (TRAINING + TEST) == 1.0
        try:
            clean_data = self.__clean_corpus()
            training_size = math.ceil(TRAINING * len(clean_data))
            training_data = clean_data[0:training_size]
            answers = clean_data[training_size:]
            # Save the training data
            self.__save(data=training_data, type='txt', data_file=TRAINING_FILE)
            # Save the gold segmentation answers
            self.__save(data=answers, type='txt', data_file=ANSWERS)
            # The test data are the answer sentences with word boundaries removed
            test_data = []
            for answer in answers:
                sentence = ''
                for word in answer:
                    sentence += word
                test_data.append(sentence)
            write_sentences(txt_data=test_data, data_file=TEST_FILE)
            # Build the full-corpus data (with sentence-start markers) for
            # training the bigram model
            for data in clean_data:
                data.insert(0, '<B>')
            self.__save(data=clean_data, type='txt', data_file=TRAINING_FILE_FOR_BIGRAM)
        except Exception as e:
            print(e)
            print("Exiting...")
            exit()
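
    # A worked example of the split arithmetic above, assuming hypothetical
    # settings TRAINING = 0.8 and TEST = 0.2: for a cleaned corpus of 1001
    # sentences, math.ceil(0.8 * 1001) = 801 sentences go to TRAINING_FILE,
    # and the remaining 200 become both the gold answers (word lists) and,
    # once each sentence's words are concatenated, the unsegmented test
    # sentences in TEST_FILE.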
if __name__ == '__main__':
    datahelper = DataHelper()
    datahelper.processed()
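
# Running `python data_helper.py` end-to-end writes six artifacts (paths
# taken from settings): TRAINING_FILE, ANSWERS, TEST_FILE and
# TRAINING_FILE_FOR_BIGRAM from __generate_train_test_data, then
# WORD_DICT_FILE and WORD_DICT_FILE_FOR_BIGRAM from __build_dict.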