-
Notifications
You must be signed in to change notification settings - Fork 8
/
loadICDM.py
140 lines (82 loc) · 3.76 KB
/
loadICDM.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
"""
sets up some basic functions and loads up useful data
assume run from Latent-Dirichlet-Allocation/ folder...
"""
import numpy as np
import scipy as sp
import os,sys
from liblda.low2corpus import Low2Corpus
# distances
from scipy.stats.distributions import entropy as spKLdiv
from liblda.math.distances import KLdiv, JSdiv
# Phi based mappings
from liblda.ILDA.hungarian_algorithm import getCostMatrix, find_closest
# Theta based mappings
from liblda.ILDA.hungarian_algorithm import getCostMatrix2, find_closest2
# data exploration, plotting and reporting
from liblda.topicviz.show_top import show_top
from liblda.topicviz.show_top import top_words_for_topic
import pylab as p
#####
##### MAIN SETTINGS FOR DATA SET
######################################################################
DATASET_NAME = "ArXiv16k"
print " LOADING DATA for: " + DATASET_NAME
DATA_PARENT_DIR="/CurrentPorjects/LatentDirichletAllocation/data/arXiv_as_LOW2/"
VOCAB_FILE = DATA_PARENT_DIR+"vocab.txt"
DOCS_FILE = DATA_PARENT_DIR+"arXiv_train_docs.txt"
IDS_FILE = DATA_PARENT_DIR+"arXiv_train_ids.txt"
######################################################################
# loaders....
# vocab, model and doc2id
train_corpus = Low2Corpus(DOCS_FILE)
train_corpus.setVocabFromList( [w.strip() for w in open(VOCAB_FILE, 'r').readlines() ] )
train_corpus.doCounts()
id_list = [w.strip() for w in open(IDS_FILE, 'r').readlines() ]
doc2id = dict( enumerate(id_list) )
def _load_run(run_dir):
    """Return the (phi, theta, z) arrays saved by one LDA run directory."""
    return ( np.load(run_dir + "/phi.npy"),
             np.load(run_dir + "/theta.npy"),
             np.load(run_dir + "/z.npy") )

# Independent repeated runs with T=60 topics.
phiT60_1, thetaT60_1, zT60_1 = _load_run("../runs/repeatedT60-1")
phiT60_2, thetaT60_2, zT60_2 = _load_run("../runs/repeatedT60-2")
phiT60_3, thetaT60_3, zT60_3 = _load_run("../runs/repeatedT60-3")
phiT60_4, thetaT60_4, zT60_4 = _load_run("../runs/repeatedT60-4")
# 5 6 7 8 deliberately skipped (as in the original listing)
phiT60_9, thetaT60_9, zT60_9 = _load_run("../runs/repeatedT60-9")

# Run 1 serves as the reference ("original") model.
phi_orig   = phiT60_1
theta_orig = thetaT60_1
z_orig     = zT60_1

# The experiment where phiT60_1 had phiT60_2 -- to phiT60_8
# merged into it, with 0, 60 and 200 steps of Gibbs resampling
# in between merging steps.
phi_m0gibbs,   theta_m0gibbs,   z_m0gibbs   = _load_run("../runs/new_merging_gibbs0")
phi_m60gibbs,  theta_m60gibbs,  z_m60gibbs  = _load_run("../runs/new_merging_gibbs60")
phi_m200gibbs, theta_m200gibbs, z_m200gibbs = _load_run("../runs/new_merging_gibbs200")

# Same as the 0gibbs variant, but with 200 final iterations at the end.
phi_m0gibbs_f200, theta_m0gibbs_f200, z_m0gibbs_f200 = _load_run("../runs/new_merging_gibbs0_f200")

# We want to test whether it is the Gibbs steps that are undoing
# the topic coherence that was achieved by the merging steps.
# One run with T=200 for fun.
phiT200, thetaT200, zT200 = _load_run("../runs/subtopicsT200unseeded")
print """
you might want to run these commands:
%run liblda/ILDA/doc_likelyhood_plots.py
"""
#seeded_m2 = find_closest2(thetacut, seeded_theta)