ProcessData.py
"""
对cvs格式的文件进行处理
返回分词好的结果
"""
import jieba
import pandas as pd
import numpy as np
from gensim.models.word2vec import Word2Vec
import warnings
warnings.filterwarnings('ignore')
# Split ratio for the dataset: 70% training, 30% testing
train_rate = 0.7
# y values (labels): first 1000 samples positive, next 1000 negative
train_y = np.concatenate((np.ones(1000), np.zeros(1000)))
np.save('train_y.npy', train_y)
# x values (training set): the review text column
pd_x = pd.read_csv('take-out.csv', encoding='gbk', usecols=['review'])
# Tokenize each review with jieba
list_x = []
for i in range(0, 2000):
    text = pd_x.loc[i, 'review']
    words = jieba.lcut(text)
    list_x.append(words)
# Train a Word2Vec model (skip-gram) to obtain word vectors.
# The sample is small, so keep the feature dimension small (100).
model = Word2Vec(list_x, sg=1, vector_size=100, window=5, min_count=2,
                 negative=3, sample=0.001, hs=1, workers=4)
model.save("word2vec_model.pkl")
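# Illustrative only (not part of the original pipeline): the trained model
# can be queried directly, assuming the word appeared at least min_count=2
# times in the corpus, e.g.
#   model.wv['好吃']                        # 100-dimensional word vector
#   model.wv.most_similar('好吃', topn=5)   # nearest words by cosine similarity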
# Build a sentence vector by summing the word vectors of its tokens
def sum_vec(text):
    vec = np.zeros(100).reshape((1, 100))
    for word in text:
        try:
            vec += model.wv[word].reshape((1, 100))
        except KeyError:
            # Skip words not in the vocabulary (dropped by min_count)
            continue
    return vec

# Stack the sentence vectors of all samples and save them
train_vec = np.concatenate([sum_vec(z) for z in list_x])
np.save('train_x_vec.npy', train_vec)
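
# Example usage (a minimal sketch, not part of the original script): a
# downstream training step could load the saved artifacts like this.
# The file names match those written above; the sample review text is
# purely illustrative.
#   train_x = np.load('train_x_vec.npy')        # (2000, 100) sentence vectors
#   train_y = np.load('train_y.npy')            # (2000,) labels
#   w2v = Word2Vec.load('word2vec_model.pkl')   # reusable word vectors
#   new_vec = sum_vec(jieba.lcut('很好吃,送餐也快'))  # vectorize a new review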