-
Notifications
You must be signed in to change notification settings - Fork 0
/
qlda.py
133 lines (124 loc) · 4.41 KB
/
qlda.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import numpy as np
import lda
import sqlite3
import json
from tqdm import tqdm
from sklearn.decomposition import TruncatedSVD
from vocab import Vocabulary
import MySQLdb as mdb
import sys
import random
import cPickle as pickle
from scipy.sparse import lil_matrix, csr_matrix
from path import Path
from argparse import ArgumentParser
def qbquery(num):
    """Return every tossup question stored in the local ``quizbowl`` database.

    Args:
        num: Ignored. The query is not limited by it; the parameter is kept
            only so existing call sites keep working.

    Returns:
        list: One entry per row of ``tossup.question``, decoded from Latin-1
        with undecodable bytes dropped.
    """
    # NOTE(review): credentials are hard-coded here; consider reading them
    # from configuration or the environment instead.
    con = mdb.connect(host='localhost', user='vasu', db='quizbowl',
                      passwd='seleniumpython')
    try:
        cur = con.cursor()
        try:
            cur.execute('use quizbowl')
            cur.execute('SELECT question FROM tossup')
            rows = cur.fetchall()
        finally:
            cur.close()
    finally:
        # In classic MySQLdb ``with con:`` only wraps a transaction and
        # leaves the connection open, so release it explicitly.
        con.close()
    return [row[0].decode('latin-1', 'ignore') for row in rows]
def jeoquery(num):
    """Return every clue stored in the ``documents`` table of ``clues.db``.

    The SQLite file is opened relative to the current working directory.

    Args:
        num: Ignored. The query is not limited by it; the parameter is kept
            only so existing call sites keep working.

    Returns:
        list: One entry per row of ``documents.clue``, encoded to ASCII with
        non-ASCII characters dropped.
    """
    conn = sqlite3.connect("clues.db")
    try:
        cursor = conn.cursor()
        try:
            cursor.execute("SELECT clue FROM documents")
            rows = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # Always release the database handle, even if the query fails.
        conn.close()
    return [row[0].encode('ascii', 'ignore') for row in rows]
# Command-line options. Every option is an optional string with a default.
#   --inputname / --inputname2 : directories that each hold a clues.pkl
#   --inputname3               : directory holding vocab.pkl
#   --inputname4               : only referenced by disabled code in this file
#   --inputname5               : directory holding ocred.pkl
#   --foldername               : output directory for the results
#   --qtype1 / --qtype2        : parsed but not read by the active code
argparser = ArgumentParser()
for _flag, _default in (
        ('--qtype1', "jeo"),
        ('--qtype2', "qb"),
        ('--foldername', "combined_both"),
        ('--inputname', "jeodata"),
        ('--inputname2', "qbdata"),
        ('--inputname3', "combined_jeo_small"),
        ('--inputname4', "ldatest"),
        ('--inputname5', "../qparser"),
):
    argparser.add_argument(_flag, default=_default)
args = argparser.parse_args()

# Wrap each input location in a Path so it can be joined with "/".
inputpath, inputpath2, inputpath3, inputpath4, inputpath5 = map(Path, (
    args.inputname,
    args.inputname2,
    args.inputname3,
    args.inputname4,
    args.inputname5,
))
#if args.qtype == "jeo":
#clues = jeoquery(20000)
#else:
#clues = qbquery(20000)
def _load_pickle(filename):
    """Unpickle and return the object stored in *filename*, closing the file."""
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(filename, "rb") as handle:
        return pickle.load(handle)


# Load the three pickled clue collections.
clues1 = _load_pickle(inputpath / "clues.pkl")
clues2 = _load_pickle(inputpath2 / "clues.pkl")
clues4 = _load_pickle(inputpath5 / "ocred.pkl")
# ``diff`` is only needed by the disabled resampling line below; guard the
# division so an empty clues2 does not abort the run.
diff = len(clues1) // len(clues2) if clues2 else 0
#clues2 = clues2 * diff + clues2[:(len(clues1) - len(clues2 * diff))]
print("%d %d" % (len(clues1), len(clues2)))
# Cap the first two collections at the size of the one loaded from ocred.pkl.
clues1 = clues1[:len(clues4)]
clues2 = clues2[:len(clues4)]
print("%d %d %d" % (len(clues1), len(clues2), len(clues4)))
# The corpus is clues2 followed by clues4; clues1 is not part of it.
oldclues = clues2 + clues4
clues = clues2 + clues4
#vocab = Vocabulary()
#clues = []
#for i in oldclues:
#if i not in clues:
#clues.append(i)
#for clue in oldclues:
#vocab.add_question(clue)
# Output directory: create it on first use.
path = Path(args.foldername)
if not path.exists():
    path.mkdir()

# Reuse the previously built vocabulary and keep a copy next to the outputs.
vocab = Vocabulary.load(inputpath3 / "vocab.pkl")
vocab.save(path / "vocab.pkl")
print(vocab.number)

# One row per clue, vocab.number + 1 columns. Allocate with the integer dtype
# up front instead of building a float matrix and converting it afterwards.
matrix = lil_matrix((len(clues), vocab.number + 1), dtype=np.int64)
for row, clue in enumerate(tqdm(clues)):
    # vocab.translate(clue) supplies the whole row for this clue.
    matrix[row] = vocab.translate(clue)
print(matrix.shape)
# LIL is convenient for row-by-row filling; switch to CSR once it is built.
matrix = csr_matrix(matrix)
#np.save(path / "matrix.npy", matrix)
#matrix = pickle.load(open(inputpath4 / "matrix.pkl", "rb"))
#print matrix.shape
#idx = np.arange(matrix.shape[1])
#print idx.shape
#with open("docformat.txt", "w") as f:
#for i in xrange(matrix.shape[0]):
#row = matrix[i].todense()
#print row.shape
#word_idx = row > 0
#words = idx[word_idx]
#unique = words.shape[0]
#counts = zip(words, row[word_idx])
#print >> f, "%d %s" % (unique,
#' '.join(
#["%d:%d" % (t, c) for t, c in counts]
#))
#svd = TruncatedSVD(n_components=700)
#docs = svd.fit_transform(matrix)
# Fit the topic model on the clue/word matrix.
n_topics = 10
model = lda.LDA(n_topics=n_topics, n_iter=10000)
docs = model.fit_transform(matrix)

# Number of highest-weighted words listed per topic.
top_words = 8
# Write the header and every topic line through a single file handle. The
# header reports the real topic count instead of a hard-coded number.
with open(path / "topicslist.txt", "w") as f:
    f.write("Combined List of %d Topics \n\n" % n_topics)
    for i, topic_dist in enumerate(model.topic_word_):
        # Indices of the top_words largest weights, largest first.
        best = topic_dist.argsort()[::-1][:top_words]
        words = [vocab.number_mapping[w] for w in best]
        line = "Topic %u: %s" % (i, ', '.join(words))
        f.write(line + "\n")
        print(line)

# Persist the fit_transform output, the topic-word matrix and the clue list.
np.save(path / "docs.npy", docs)
np.save(path / "topics.npy", model.topic_word_)
with open(path / "clues.pkl", "wb") as clue_file:
    pickle.dump(clues, clue_file)
#with open(path / "qbtopics.txt", "w") as f:
#f.write("Jeopardy Topics and Their Questions \n")
#for i in range(7):
#with open(path / "qbtopics.txt", "a") as f2:
#f2.write("Topic %d: \n\n" % i)
#questions = np.argsort(docs.transpose()[i])[::-1][:10]
#for question in questions:
#output = clues[question].encode("ascii", "ignore") + "\n\n"
#with open(path / "qbtopics.txt", "a") as f3:
#f3.write(output)