-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathLexRank.py
78 lines (71 loc) · 2.75 KB
/
LexRank.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/usr/bin/python
import networkx as nx
import similarity
from nltk.tokenize import word_tokenize
import numpy as np
import miscellaneous as ms
import itertools
import argparse
# The sentence graph is built from pairwise similarity scores computed by
# similarity.idf_modified_cosine(); pairs scoring above a threshold get an edge.
def build_graph(nodes, threshold, idf):
    """Build an undirected sentence-similarity graph.

    Every sentence in *nodes* becomes a vertex. For each unordered pair
    of sentences, the idf-modified cosine similarity is computed; when it
    exceeds *threshold*, an edge weighted by that similarity is added.

    :param nodes: iterable of sentence strings (graph vertices)
    :param threshold: minimum similarity required for an edge
    :param idf: idf table consumed by similarity.idf_modified_cosine
    :return: a networkx.Graph over the sentences
    """
    graph = nx.Graph()  # undirected graph of sentences
    graph.add_nodes_from(nodes)
    # Visit each unordered sentence pair exactly once.
    for left, right in itertools.combinations(nodes, 2):
        score = similarity.idf_modified_cosine(
            word_tokenize(left), word_tokenize(right), idf)
        if score > threshold:
            graph.add_edge(left, right, weight=score)
    return graph
# Dead code kept for reference: a hand-rolled row-normalisation of the
# adjacency matrix (the B matrix of the LexRank formulation). It was
# superseded by nx.pagerank, which handles the normalisation internally,
# so this function was never finished or called.
'''
def get_Bij(gr):
    A = nx.adjacency_matrix(gr, None, weight='weight')
    A = A.toarray()
    B = []
    row_sum = np.sum(A, axis=1)
    it = 0
    for row in A:
        r = []
        for ele in row:
            r.append(ele/row_sum[it])
        it += 1
        B.append(r)
'''
def get_keysentences(graph):
    """Rank the sentence graph with PageRank and return the sentences
    ordered from MOST to least important.

    :param graph: networkx.Graph whose edges carry a 'weight' attribute
        (the idf-modified cosine similarity between the two sentences)
    :return: list of sentence nodes in descending order of PageRank score
    """
    # 'weight' is the similarity value attached in build_graph.
    calculated_page_rank = nx.pagerank(graph, weight='weight')
    # BUG FIX: this previously used reverse=False (ascending importance),
    # so every caller that slices the head of the list ("top N") was
    # actually getting the LEAST important sentences. Descending order
    # puts the most important sentences first.
    keysentences = sorted(calculated_page_rank,
                          key=calculated_page_rank.get, reverse=True)
    return keysentences
def get_top(data, topn, threshold):
    """Return the *topn* highest-ranked sentences of *data* via LexRank.

    :param data: raw document text
    :param topn: number of key sentences to return
    :param threshold: minimum idf-modified cosine similarity for an edge
        in the sentence graph
    :return: list of at most *topn* sentence strings
    """
    sent_tokens, word_tokens = ms.tokenize(data)
    words = list(set(ms.get_words(data)))
    N = len(sent_tokens)  # document count used by the idf computation
    idf = similarity.idf(word_tokens, words, N)
    # FIX: removed an unused O(n^2) call to
    # similarity.get_similarity_matrix(word_tokens, idf) — its result was
    # bound to a local that was never read, so it was pure wasted work.
    gr = build_graph(sent_tokens, threshold, idf)
    keysentences = get_keysentences(gr)
    return keysentences[:topn]
if __name__ == "__main__":
    # Command-line entry point: print the similarity matrix and the top-N
    # LexRank key sentences of a text file.
    parser = argparse.ArgumentParser(description="Pass the data fname and threshold value")
    parser.add_argument("fname", help="Provide the data file name")
    # FIX: a plain positional argument ignores `default` and is always
    # required; nargs="?" makes these optional so the defaults apply.
    parser.add_argument("threshold", nargs="?", help="Provide the threshold value", default=0.15, type=float)
    parser.add_argument("N", nargs="?", help="Top N sentences to be picked", default=10, type=int)
    args = parser.parse_args()
    with open(args.fname, "r") as f:
        data = f.read()
    sent_tokens, word_tokens = ms.tokenize(data)
    words = list(set(ms.get_words(data)))
    N = len(sent_tokens)  # document count for the idf computation
    idf = similarity.idf(word_tokens, words, N)
    matrix = similarity.get_similarity_matrix(word_tokens, idf)
    print("Printing similarity matrix:\n", matrix)
    gr = build_graph(sent_tokens, args.threshold, idf)
    keysentences = get_keysentences(gr)
    # FIX: the label hard-coded "Top 10" while the slice used args.N.
    print("Printing Top %d Key sentences:\n" % args.N, keysentences[:args.N])