-
Notifications
You must be signed in to change notification settings - Fork 40
/
demo.py
55 lines (45 loc) · 1.67 KB
/
demo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#encoding=utf-8
from zhcnSegment import *
from fileObject import FileObj
from sentenceSimilarity import SentenceSimilarity
from sentence import Sentence
def evaluate_accuracy(ss, test_sentences, train_count):
    """Score a similarity model against a test set and print the results.

    Test sentence ``i`` counts as right when the object returned by
    ``ss.similarity(test_sentences[i])`` has ``id == i``.
    NOTE(review): this presumably means ``id`` is the index of the matched
    training sentence and that the test file is line-aligned with the
    training file -- confirm against ``SentenceSimilarity``.

    Args:
        ss: Object exposing ``similarity(text)``; the result must provide
            ``id`` and ``score`` attributes.
        test_sentences: Query sentences to evaluate, in order.
        train_count: Number of training sentences. Only the first
            ``min(train_count, len(test_sentences))`` queries are evaluated,
            so a test set shorter than the training set no longer raises
            ``IndexError``.

    Returns:
        The fraction of evaluated queries that were right, as a float.
        ``0.0`` is returned when there is nothing to evaluate (instead of
        raising ``ZeroDivisionError``).
    """
    total = min(train_count, len(test_sentences))
    right_count = 0
    for i, text in zip(range(total), test_sentences):
        sentence = ss.similarity(text)
        # Single-argument parenthesized print behaves identically on
        # Python 2 and Python 3.
        if i != sentence.id:
            print(str(i) + " wrong! score: " + str(sentence.score))
        else:
            right_count += 1
            print(str(i) + " right! score: " + str(sentence.score))

    # float() keeps true division on Python 2 as well.
    accuracy = float(right_count) / total if total else 0.0
    print("正确率为: " + str(accuracy))
    return accuracy


def main():
    """Train a TF-IDF similarity model on the training set and evaluate it."""
    # Read the training set.
    train_sentences = FileObj(r"testSet/trainSet.txt").read_lines()
    # Read test set 1.
    test1_sentences = FileObj(r"testSet/testSet1.txt").read_lines()
    # Read test set 2 (only used by the optional evaluation at the bottom).
    test2_sentences = FileObj(r"testSet/testSet2.txt").read_lines()

    # Word segmenter: per the original author's note, a thin wrapper around
    # jieba whose main addition is stop-word removal.
    seg = Seg()

    # Train the model.
    ss = SentenceSimilarity(seg)
    ss.set_sentences(train_sentences)
    ss.TfidfModel()  # TF-IDF model
    # ss.LsiModel()  # LSI model (alternative)
    # ss.LdaModel()  # LDA model (alternative)

    # Evaluate on test set 1.
    evaluate_accuracy(ss, test1_sentences, len(train_sentences))

    # Evaluate on test set 2 (disabled, as in the original script).
    # evaluate_accuracy(ss, test2_sentences, len(train_sentences))


if __name__ == '__main__':
    main()