-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathWord2vec.py
106 lines (82 loc) · 3.65 KB
/
Word2vec.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
from gensim.models import Word2Vec
from util import *
@func_timer
def build_w2v(path, save=False, text='', load=False):
    """Return a gensim ``Word2Vec`` model, either loaded from disk or trained.

    Args:
        path: File path the model is loaded from (``load=True``) or saved to
            (``save=True``).
        save: When training, persist the freshly trained model to ``path``.
        text: Forwarded to ``read_files(text, lst=True)`` to obtain the
            training corpus (presumably a corpus folder/file name -- confirm
            against ``util.read_files``).
        load: When true, skip training entirely and load the model from
            ``path``; ``save`` and ``text`` are ignored in that case.

    Returns:
        The loaded or newly trained ``Word2Vec`` instance.
    """
    if not load:
        print('正在训练模型...')
        corpus = read_files(text, lst=True)
        # Fixed training configuration; sg=1 selects skip-gram and hs=1
        # hierarchical softmax (see the gensim Word2Vec documentation).
        hyper_params = dict(
            vector_size=400,
            window=7,
            min_count=2,
            workers=32,
            epochs=50,
            sg=1,
            hs=1,
            sample=1e-4,
            max_vocab_size=None,
        )
        trained = Word2Vec(corpus, **hyper_params)
        if save:
            trained.save(path)
            print(f"模型已保存到{path}")
        return trained

    loaded = Word2Vec.load(path)
    print(f"模型已从{path}加载")
    return loaded
def find_synonym(model, word, top_n=5):
    """Look up the nearest neighbours of ``word`` in the model's vector space.

    Args:
        model: Object exposing ``wv.most_similar(word, topn=...)``.
        word: The query token.
        top_n: Number of neighbours requested (passed as ``topn``).

    Returns:
        Whatever ``most_similar`` returns for ``word``, or ``None`` when the
        lookup raises ``KeyError`` (e.g. the token is not in the vocabulary).
    """
    try:
        neighbours = model.wv.most_similar(word, topn=top_n)
    except KeyError:
        neighbours = None
    return neighbours
def visualize_words(model, words, perplexity=10, n_iter=1000, title='词向量可视化'):
    """Project word vectors to 2-D with t-SNE, plot them, and save the figure.

    Words that are not in ``model.wv`` are skipped. The figure is written to
    ``f'{title}.png'`` and then shown.

    Args:
        model: Object whose ``wv`` supports ``in`` and ``[]`` lookups by word.
        words: Iterable of words to plot.
        perplexity: t-SNE perplexity (NOTE(review): scikit-learn expects this
            to be smaller than the number of plotted words -- confirm for
            short word lists).
        n_iter: Iteration budget, passed to ``TSNE`` as ``max_iter``.
        title: Plot title; also used as the output file stem.

    Returns:
        None. Prints a message and returns early when no word is in the
        vocabulary.
    """
    # Filter once and keep the surviving words: the rows of the embedding
    # correspond to these words only, so labels must be taken from this list
    # rather than from the positions in the unfiltered ``words``.
    known_words = [w for w in words if w in model.wv]
    if not known_words:
        print('模型中无匹配的词。')
        return

    word_vec = np.array([model.wv[w] for w in known_words])
    tsne = TSNE(perplexity=perplexity, n_components=2, init='pca', max_iter=n_iter, random_state=2)
    embd = tsne.fit_transform(word_vec)

    plt.figure(figsize=(16, 9))
    for w, (x, y) in zip(known_words, embd):
        plt.scatter(x, y)
        plt.annotate(w, (x, y))
    plt.title(title)
    plt.savefig(f'{title}.png')
    plt.show()
def cluster_words(model, n_clusters=10, top_n=5000):
    """Group the first ``top_n`` vocabulary entries into k-means clusters.

    Args:
        model: Object exposing ``wv.index_to_key`` and ``wv[word]`` lookups.
        n_clusters: Number of clusters requested from ``KMeans``.
        top_n: How many leading entries of ``wv.index_to_key`` to cluster.

    Returns:
        A dict mapping each cluster label to the list of words assigned to
        it, in the order the words appear in ``wv.index_to_key``.
    """
    vocab = model.wv.index_to_key[:top_n]
    estimator = KMeans(n_clusters=n_clusters)
    vectors = [model.wv[token] for token in vocab]
    estimator.fit(vectors)

    groups = {}
    for token, label in zip(vocab, estimator.labels_):
        groups.setdefault(label, []).append(token)
    return groups
def sentence2vec(model, sentence):
    """Average the vectors of the known elements of ``sentence``.

    ``sentence`` is iterated element by element (character by character when
    it is a string); elements missing from ``model.wv`` are ignored.

    Args:
        model: Object whose ``wv`` supports ``in`` and ``[]`` lookups.
        sentence: Iterable of tokens.

    Returns:
        The element-wise mean of the matched vectors, or ``None`` when no
        element of ``sentence`` is in the vocabulary.
    """
    matched = []
    for token in sentence:
        if token in model.wv:
            matched.append(model.wv[token])

    if len(matched) == 0:
        return None
    return np.mean(matched, axis=0)
def sentence_sim(model, sentence1, sentence2):
    """Cosine similarity between the averaged vectors of two sentences.

    Args:
        model: Model passed through to ``sentence2vec``.
        sentence1: First sentence.
        sentence2: Second sentence.

    Returns:
        The cosine of the angle between the two sentence vectors, or ``None``
        when ``sentence2vec`` yields ``None`` for either sentence.
    """
    first = sentence2vec(model, sentence1)
    second = sentence2vec(model, sentence2)

    if first is not None and second is not None:
        magnitude = np.linalg.norm(first) * np.linalg.norm(second)
        return np.dot(first, second) / magnitude
    return None
# Load the previously saved model for the selected corpus slice.
folder = 'all - slice'
model = build_w2v(f'w2v_model_{folder}.bin', text=f'ctext - {folder}', load=True)

# Interactive nearest-neighbour lookup; entering "q" (any case) ends the loop.
while True:
    word = input('请输入要查询的词语 (打"q"退出): ')
    if word.lower() == 'q':
        break
    syn = find_synonym(model, word, top_n=15)
    if not syn:
        # find_synonym returns None when the lookup raises KeyError.
        print(f'{word}不在词汇表中')
        continue
    for w, sim in syn:
        print(f'"{w}"\t相似度{sim:.6f}')

# Word-vector visualisation (t-SNE)
visualize_words(model, ['州', '侯', '城', '郡', '縣', '晉', '楚', '宋', '鄭', '衛', '鳥', '獸', '鹿', '尾', '犬', '蟲', '蟻', '蠕', '蚊', '蛛',
                        '花', '頭', '香', '葉', '樓', '病', '痛', '熱', '腹', '脈', '鼓', '舞', '琴', '鍾', '鐘', '祭', '祝', '尸', '奠', '牲',
                        '之', '不', '以', '曰', '也', '年', '官', '軍', '州', '兵', '雲', '飛', '兮', '舟', '晴', '甘', '肉', '味', '米', '枚',
                        '旖', '旎', '膀', '胱', '箜', '篌', '邯', '鄲'])

# Morpheme clustering: show up to 25 members of each cluster.
for cl, w in cluster_words(model, n_clusters=50, top_n=10000).items():
    print(f"聚类{cl}: {', '.join(w[:25])}")

# Sentence similarity
# de_p comes from util (presumably strips punctuation -- confirm there).
s1 = de_p('關關雎鳩,在河之洲。窈窕淑女,君子好逑。')
s2 = de_p('學而時習之,不亦說乎?有朋自遠方來,不亦樂乎?人不知而不慍,不亦君子乎?')
print(f'句1\t{s1}')
print(f'句2\t{s2}')
similarity = sentence_sim(model, s1, s2)
if similarity is None:
    # sentence_sim returns None when a sentence has no in-vocabulary element;
    # applying a float format spec to None would raise TypeError.
    print('相似度\t无法计算(句子中没有词汇表内的字)')
else:
    print(f'相似度\t{similarity: .6f}')