-
Notifications
You must be signed in to change notification settings - Fork 8
/
loadSeededUnseeded.py
93 lines (50 loc) · 2.06 KB
/
loadSeededUnseeded.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
"""
Sets up some basic helper functions (via imports) and loads useful data
for interactive exploration.
Assumes it is run from the Latent-Dirichlet-Allocation/ folder.
"""
import numpy as np
import scipy as sp
import os,sys
from liblda.low2corpus import Low2Corpus
# distances
from scipy.stats.distributions import entropy as spKLdiv
from liblda.math.distances import KLdiv, JSdiv
# Phi based mappings
from liblda.subtopics.hungarian_algorithm import getCostMatrix, find_closest
# Theta based mappings
from liblda.subtopics.hungarian_algorithm import getCostMatrix2, find_closest2
# data exploration, plotting and reporting
from liblda.topicviz.show_top import show_top
from liblda.topicviz.show_top import top_words_for_topic
import pylab as p
#####
##### MAIN SETTINGS FOR DATA SET
######################################################################
# Human-readable name of the data set; only used in the status message below.
DATASET_NAME = "ArXiv16k"

# A parenthesised single expression prints exactly the same text on
# Python 2, and (unlike the bare print statement) is also valid Python 3.
print(" LOADING DATA for: " + DATASET_NAME)

# Absolute directory holding the data files.  The trailing slash matters:
# the file paths below are built by plain string concatenation.
# NOTE(review): "Porjects" looks like a typo, but it may match the real
# directory name on disk -- confirm before changing it.
DATA_PARENT_DIR = "/CurrentPorjects/LatentDirichletAllocation/data/arXiv_as_LOW2/"

VOCAB_FILE = DATA_PARENT_DIR + "vocab.txt"            # read line by line into the corpus vocabulary
DOCS_FILE = DATA_PARENT_DIR + "arXiv_train_docs.txt"  # handed to Low2Corpus
IDS_FILE = DATA_PARENT_DIR + "arXiv_train_ids.txt"    # read line by line into id_list
######################################################################
# loaders....
# vocab, model and doc2id
def _read_stripped_lines(path):
    """Return every line of the text file at `path`, stripped of surrounding whitespace."""
    # The with-block closes the file as soon as reading is done instead of
    # leaving the handle open until the garbage collector gets to it.
    with open(path, 'r') as infile:
        return [line.strip() for line in infile]


# Corpus object built from the documents file; its vocabulary is set from
# the stripped lines of VOCAB_FILE before doCounts() is called.
tcorpus3 = Low2Corpus(DOCS_FILE)
tcorpus3.setVocabFromList(_read_stripped_lines(VOCAB_FILE))
tcorpus3.doCounts()

# id_list[i] is the stripped i-th line of IDS_FILE; doc2id maps that
# 0-based line position to the id string.
id_list = _read_stripped_lines(IDS_FILE)
doc2id = dict(enumerate(id_list))
# data
# All paths are relative to the current working directory.

# Arrays saved by the "subtopicsT40" run.
phi = np.load("../runs/subtopicsT40/phi.npy")
theta = np.load("../runs/subtopicsT40/theta.npy")

# Arrays saved by the "subtopicsT200unseeded" run.
unseeded_phi = np.load("../runs/subtopicsT200unseeded/phi.npy")
unseeded_theta = np.load("../runs/subtopicsT200unseeded/theta.npy")

# The "subtopicsT200seeded" run is currently not loaded.
#seeded_phi = np.load("../runs/subtopicsT200seeded/phi.npy")
#seeded_theta = np.load("../runs/subtopicsT200seeded/theta.npy")
#p.clf(); p.plot(unseeded_theta[2000:3000,[75,61,15]]); p.ylim([0,0.03])
#hist_of_topics_in_docs(seeded_theta)
# Hint for the interactive user.  The parenthesised single-string form
# prints the same text on Python 2 and is also valid Python 3.
print("""
you might want to run these commands:
%run liblda/ILDA/doc_likelyhood_plots.py
""")
#seeded_m2 = find_closest2(thetacut, seeded_theta)