forked from jinyyy666/AR_Miner
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathAR_textrank.py
147 lines (125 loc) · 4.01 KB
/
AR_textrank.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#!/usr/bin/python
#
# Use text rank to handle review instance-level ranking
#
#
# Author: Yingyezhe Jin; Date: Apr. 21, 2017
# python imports:
import sys

import numpy as np
try:
    from scipy.sparse import coo_matrix
    from scipy.sparse import csc_matrix
except:
    print("Please install the module 'scipy' for sparse matrix!")
    print("pip install scipy")
    sys.exit(-1)
# AR Miner imports:
from AR_util import sim
# Input:
# doc_topic : lda topic model matrix n*k, n : number of reviews, k: # of topics
# informRev : all informative reviews
# Output:
# rankedInstance : a dict = {topic, list of sorted reviews}
# ##rankedGroup : a list of ranked group indexs##
def AR_textrank(doc_topic, informRev):
    """Rank informative reviews inside each LDA topic group using TextRank.

    Input:
        doc_topic : lda topic model matrix n*k, n : number of reviews, k: # of topics
        informRev : all informative reviews (length must equal n)
    Output:
        rankedInstance : a dict = {topic : list of (review index, score) pairs,
                         sorted by score in descending order}
    """
    # 1. Simple topic grouping and group ranking (by the volume only):
    # For each group topic, collect the reviews that have the highest prob:
    n, k = doc_topic.shape
    assert n == len(informRev)
    reviewG = {}
    reviewGInd = {}
    for i in range(n):
        # Assign each review to its single most probable topic.
        topic = doc_topic[i].argmax()
        # Fix: dict.has_key() was removed in Python 3; use the `in` operator.
        if topic not in reviewG:
            reviewG[topic] = []
            reviewGInd[topic] = []
        reviewG[topic].append(informRev[i])
        reviewGInd[topic].append(i)
    # Every one of the k topics is expected to have at least one review.
    assert len(reviewG) == k
    rankedInstance = {}
    # 2. Text rank within each group:
    for j in range(k):
        rankedInstance[j] = []
        textRankGroup(j, rankedInstance, reviewG[j], reviewGInd[j])
    return rankedInstance
# Rank the reviews in the same group by text rank
# Input:
# topic : the topic index
# rankedInstance : a dict = {topic, list of sorted reviews}, this is also the return
# reviews : the corresponding review instances
# reviewInds : the corresponding review indices w.r.t all the informative reviews
# Output:
# rankedInstance : modify it directly for the return
def textRankGroup(topic, rankedInstance, reviews, reviewInds):
    """Rank the reviews in the same group by text rank.

    Input:
        topic : the topic index
        rankedInstance : a dict = {topic, list of sorted reviews}, this is also the return
        reviews : the corresponding review instances
        reviewInds : the corresponding review indices w.r.t all the informative reviews
    Output:
        rankedInstance : modified in place; rankedInstance[topic] receives
        (review index, score) pairs sorted by score descending
    """
    # 1. Construct the connectivity graph:
    G = constructGraph(reviews)
    # 2. Run the page rank on this graph and output the rankings:
    score = pageRank(G)
    # 3. Prepare the indices for return.
    # Fix: in Python 3, zip() returns an iterator with no .sort() method;
    # build the sorted list with sorted() instead.
    ranked = sorted(zip(reviewInds, score), key=lambda pair: pair[1], reverse=True)
    rankedInstance[topic].extend(ranked)
# Construct the graph of the reviews where the edges present the similarity:
# Input:
# reviews : the review instances in a list
# Output:
# G : coo sparse matrix for connectivity graph
def constructGraph(reviews):
    """Build the review connectivity graph: an undirected edge links every
    pair of reviews judged similar by sim().

    Input:
        reviews : the review instances in a list
    Output:
        G : coo sparse n*n matrix; entry 1 at (i, j) and (j, i) for each
        similar pair, 0 elsewhere
    """
    n = len(reviews)
    rows = []
    cols = []
    cnt = 0
    # Visit each unordered pair exactly once (j > i) and add a symmetric edge
    # when the two reviews are similar.
    for i in range(n):
        for j in range(i + 1, n):
            if sim(reviews[i], reviews[j]):
                cnt += 2
                rows.extend((i, j))
                cols.extend((j, i))
    vals = [1] * len(rows)
    print("In construct the graph of reviews ---- Nodes: " + str(len(reviews)) + " Edges: " + str(cnt))
    return coo_matrix((np.asarray(vals), (np.asarray(rows), np.asarray(cols))), shape=(n, n))
# Compute the page rank score of the given graph
# Adopt the code from: https://gist.github.com/diogojc/1338222/84d767a68da711a154778fb1d00e772d65322187
# Credit: diogojc
# Input:
# G : the coo_matrix (sparse matrix)
# s : the probability of link following
# maxerr : the convergence criterion (L1 tolerance between iterations)
# Output:
# score : the page rank scores
def pageRank(G, s=.85, maxerr=.0001):
    """Compute the page rank score of the given graph.

    Adopt the code from: https://gist.github.com/diogojc/1338222/84d767a68da711a154778fb1d00e772d65322187
    Credit: diogojc

    Input:
        G : the coo_matrix (sparse matrix) adjacency graph
        s : the probability of link following (damping factor)
        maxerr : the convergence criterion (L1 distance between iterations)
    Output:
        score : the page rank scores, normalized to sum to 1
    """
    n = G.shape[0]
    # Transform G into a Markov matrix A (rows scaled to sum to 1).
    # Fix: np.float was removed in NumPy 1.20; the builtin float is equivalent.
    A = csc_matrix(G, dtype=float)
    rsums = np.array(A.sum(1))[:, 0]
    ri, ci = A.nonzero()
    # Only rows with at least one nonzero are indexed, so rsums[ri] > 0 here.
    A.data /= rsums[ri]
    # Bool array of sink states (rows with no outgoing edges).
    sink = rsums == 0
    # Hoisted out of the loop: both terms are iteration-invariant.
    # Di: mass redistributed from sink states; Ei: teleportation term.
    Di = sink / float(n)
    Ei = np.ones(n) / float(n)
    # Compute pagerank r until we converge.
    ro, r = np.zeros(n), np.ones(n)
    while np.sum(np.abs(r - ro)) > maxerr:
        ro = r.copy()
        # Calculate each pagerank one state at a time.
        # Fix: xrange is Python 2 only; use range.
        for i in range(n):
            # Inlinks of state i.
            Ai = np.array(A[:, i].todense())[:, 0]
            r[i] = ro.dot(Ai * s + Di * s + Ei * (1 - s))
    # Return normalized pagerank.
    return r / float(sum(r))