# idresearch.py
import requests
import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from collections import Counter
from heapq import nlargest
from matplotlib import pyplot as plt
from itertools import groupby

# Load the spaCy model once; NER is kept as an alias so both names still work.
nlp = spacy.load("en_core_web_md")
NER = nlp
#----------------------------------------#
def get_doc(doc_url):
    """Get response
    Obtains responses from the Semantic Scholar API.
    Args:
        doc_url (str): URL of the paper. URLs from semanticscholar, arxiv, aclweb, acm, and biorxiv are supported.
    Returns:
        doc_fox (dict): Metadata of the main article query.
        doc_paperId (str): Semantic Scholar paperId.
        reco_fox (dict): Metadata of the recommended papers from the Semantic Scholar recommendations API.
    """
    response = requests.get(f"https://api.semanticscholar.org/graph/v1/paper/URL:{doc_url}?fields=abstract")
    doc_fox = response.json()
    doc_paperId = doc_fox['paperId']
    response = requests.get(f"https://api.semanticscholar.org/recommendations/v1/papers/forpaper/{doc_paperId}?fields=url,abstract,year,citationCount,authors,venue")
    reco_fox = response.json()
    return doc_fox, doc_paperId, reco_fox
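
# Usage sketch for get_doc. The arXiv URL below is a hypothetical placeholder,
# not one referenced by this module; any supported paper URL should behave the same.
# >>> doc_fox, doc_paperId, reco_fox = get_doc("https://arxiv.org/abs/1706.03762")
# >>> doc_paperId                          # Semantic Scholar's id for the paper
# >>> len(reco_fox['recommendedPapers'])   # number of recommendations returned
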
def reco_abstract(i, url):
    """Recommended paper's abstract
    Obtains the abstract of the queried recommended paper.
    Args:
        i (int): Index number as seen in the output of the get_reco_df or export_reco_csv functions.
        url (str): URL of the paper. URLs from semanticscholar, arxiv, aclweb, acm, and biorxiv are supported.
    Returns:
        reco_abs (str): Abstract of the queried recommended paper.
    """
    doc_fox, doc_paperId, reco_fox = get_doc(url)
    reco_abs = reco_fox['recommendedPapers'][i]['abstract']
    return reco_abs
def main_abstract(url):
    """Main paper's abstract
    Obtains the abstract of the main queried paper.
    Args:
        url (str): URL of the paper. URLs from semanticscholar, arxiv, aclweb, acm, and biorxiv are supported.
    Returns:
        main_abs (str): Abstract of the main queried paper.
    """
    doc_fox, doc_paperId, reco_fox = get_doc(url)
    main_abs = doc_fox['abstract']
    return main_abs
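
# Usage sketch for the two abstract helpers (same hypothetical URL as above).
# Note that each call re-queries the API via get_doc.
# >>> main_abstract("https://arxiv.org/abs/1706.03762")     # abstract of the queried paper
# >>> reco_abstract(0, "https://arxiv.org/abs/1706.03762")  # abstract of the top recommendation
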
def get_ner(raw_abs):
    """Named Entity Recognition
    Obtains named entities of type ORG and PRODUCT from the abstract.
    Args:
        raw_abs (str): Raw text of any article abstract.
    Returns:
        org_ent (list): List of strings containing the ORG-type entities.
        prod_ent (list): List of strings containing the PRODUCT-type entities.
    """
    text1 = NER(raw_abs)
    # Group entities by label; deduplicate by converting each span to its string form.
    entities = {key: list(set(map(str, g))) for key, g in groupby(sorted(text1.ents, key=lambda x: x.label_), lambda x: x.label_)}
    org_ent = entities.get('ORG', [])
    prod_ent = entities.get('PRODUCT', [])
    return org_ent, prod_ent
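
# Usage sketch for get_ner on a toy sentence; the text is illustrative only, and
# the exact entities returned depend on the en_core_web_md model.
# >>> org, prod = get_ner("Google released TensorFlow, a library for machine learning.")
# >>> org    # e.g. ['Google']
# >>> prod   # e.g. ['TensorFlow']
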
def get_reco_df(url):
    """Recommended articles dataframe
    Obtains a dataframe of all the recommended articles (url, abstract, year, publisher, citation_count).
    Args:
        url (str): URL of the paper. URLs from semanticscholar, arxiv, aclweb, acm, and biorxiv are supported.
    Returns:
        new_df (DataFrame): Dataframe containing information on all recommended articles, sorted by citation count.
    """
    doc_fox, doc_paperId, reco_fox = get_doc(url)
    dict1 = [dict(paperId=a1['paperId'], url=a1['url'], abstract=a1['abstract'], year=a1['year'], publisher=a1['venue'], citation_count=a1['citationCount']) for a1 in reco_fox['recommendedPapers']]
    df1 = pd.DataFrame(dict1)
    # Keep the original index (ignore_index=False) so rows can be cross-referenced
    # with reco_abstract, which indexes into the raw recommendation list.
    new_df = df1.sort_values(by='citation_count', ascending=False, ignore_index=False)
    return new_df
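
# Usage sketch for get_reco_df (hypothetical URL as above). Rows are sorted by
# citation_count, but the original index is preserved so it can be passed as the
# i argument of reco_abstract.
# >>> new_df = get_reco_df("https://arxiv.org/abs/1706.03762")
# >>> new_df[['year', 'publisher', 'citation_count']].head()
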
def export_reco_csv(url):
    """Export recommended articles to CSV
    Exports a CSV of all the recommended articles (url, abstract, year, publisher, citation_count).
    Args:
        url (str): URL of the paper. URLs from semanticscholar, arxiv, aclweb, acm, and biorxiv are supported.
    Exports:
        new_df (csv): A CSV file containing information on all recommended articles. Filename: recolist.csv
    """
    new_df = get_reco_df(url)
    new_df.to_csv('recolist.csv')
def summarize_doc(raw_abs, n):
    """Summarize the abstract
    Summarizes the abstract by assigning a weight to each sentence, based on the
    normalized frequency of the keywords it contains.
    Args:
        raw_abs (str): Raw text of any article abstract.
        n (int): Number of sentences in the summary.
    Returns:
        summary (str): Summary of the abstract in n sentences, per the argument.
    """
    doc1 = nlp(raw_abs)
    keywords = []
    stopwords = list(STOP_WORDS)
    pos_tag = ['PROPN', 'ADJ', 'NOUN', 'VERB', 'NUM', 'SYM', 'X']
    for token in doc1:
        if token.text in stopwords or token.text in punctuation:
            continue
        if token.pos_ in pos_tag:
            keywords.append(token.text)
    # Normalize keyword frequencies by the count of the most common keyword.
    freq_word = Counter(keywords)
    max_freq = freq_word.most_common(1)[0][1]
    for word in freq_word.keys():
        freq_word[word] = freq_word[word] / max_freq
    # Score each sentence as the sum of its keywords' normalized frequencies.
    sent1 = {}
    for se in doc1.sents:
        for word in se:
            if word.text in freq_word.keys():
                if se in sent1.keys():
                    sent1[se] = sent1[se] + freq_word[word.text]
                else:
                    sent1[se] = freq_word[word.text]
    # nlargest yields spaCy spans; join their text so the return value is a
    # string, as documented.
    summary = ' '.join(span.text for span in nlargest(n, sent1, key=sent1.get))
    return summary
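
# Usage sketch for summarize_doc: summarize a fetched abstract in two sentences.
# The URL is the same hypothetical placeholder used above.
# >>> abs_text = main_abstract("https://arxiv.org/abs/1706.03762")
# >>> summarize_doc(abs_text, 2)
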
def doc_stats(raw_abs):
    """Abstract statistics
    Finds the most frequently used keywords in the abstract.
    Args:
        raw_abs (str): Raw text of any article abstract.
    Returns:
        freq_ans (dict): Maps each of the five most common keywords to its number of occurrences in the abstract.
    """
    doc1 = nlp(raw_abs)
    keywords = []
    stopwords = list(STOP_WORDS)
    pos_tag = ['PROPN', 'ADJ', 'NOUN', 'VERB', 'NUM', 'SYM', 'X']
    for token in doc1:
        if token.text in stopwords or token.text in punctuation:
            continue
        if token.pos_ in pos_tag:
            keywords.append(token.text)
    freq_word = Counter(keywords)
    # Wrap in dict so the return type matches the docstring.
    freq_ans = dict(freq_word.most_common(5))
    return freq_ans
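
# Usage sketch for doc_stats: the five most frequent keywords of an abstract,
# e.g. {'learning': 4, 'model': 3, ...} (actual keywords and counts depend on the text).
# >>> doc_stats(main_abstract("https://arxiv.org/abs/1706.03762"))
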
def reco_authors(url, num):
    """List of authors
    Provides the authors and the number of times each author's papers have been
    recommended, in descending order.
    Args:
        url (str): URL of the paper. URLs from semanticscholar, arxiv, aclweb, acm, and biorxiv are supported.
        num (int): Number of author names in the output.
    Returns:
        occurence_common (dict): Maps author name to the number of times the author's articles have been recommended.
    """
    doc_fox, doc_paperId, reco_fox = get_doc(url)
    authors1 = []
    authorids1 = []  # collected for completeness; not used in the return value
    for paper in reco_fox['recommendedPapers']:
        for author in paper['authors']:
            authors1.append(author['name'])
            authorids1.append(author['authorId'])
    occurence_count = Counter(authors1)
    occurence_common = dict(occurence_count.most_common(num))
    return occurence_common
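
# Usage sketch for reco_authors: the ten most frequently recommended authors
# for the hypothetical URL used above.
# >>> reco_authors("https://arxiv.org/abs/1706.03762", 10)
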
def plot_CitationCount_url(url):
    """Plot number of papers vs. citation count
    Plots a histogram of the number of papers against citation count.
    Args:
        url (str): URL of the paper. URLs from semanticscholar, arxiv, aclweb, acm, and biorxiv are supported.
    Returns:
        plot: A matplotlib plot.
    """
    new_df = get_reco_df(url)
    new_df['citation_count'].plot(kind='hist', bins=25)
    plt.xlabel('Number of citations')
    plt.ylabel('Number of papers')
    plt.title("Number of citations for recommended papers")
    plt.show()
def plot_CitationCount_df(new_df):
    """Plot number of papers vs. citation count
    Plots a histogram of the number of papers against citation count.
    Args:
        new_df (DataFrame): Pandas dataframe exported using the get_reco_df function.
    Returns:
        plot: A matplotlib plot.
    """
    new_df['citation_count'].plot(kind='hist', bins=25)
    plt.xlabel('Number of citations')
    plt.ylabel('Number of papers')
    plt.title("Number of citations for recommended papers")
    plt.show()
def plot_YearTrend_url(url):
    """Plot number of papers vs. publication year
    Plots a histogram of the number of papers against publication year.
    Args:
        url (str): URL of the paper. URLs from semanticscholar, arxiv, aclweb, acm, and biorxiv are supported.
    Returns:
        plot: A matplotlib plot.
    """
    new_df = get_reco_df(url)
    new_df['year'].plot(kind='hist', bins=25)
    plt.ylabel('Number of papers')
    plt.xlabel('Year')
    plt.title("Trend of development of the field")
    plt.show()
def plot_YearTrend_df(new_df):
    """Plot number of papers vs. publication year
    Plots a histogram of the number of papers against publication year.
    Args:
        new_df (DataFrame): Pandas dataframe exported using the get_reco_df function.
    Returns:
        plot: A matplotlib plot.
    """
    new_df['year'].plot(kind='hist', bins=25)
    plt.ylabel('Number of papers')
    plt.xlabel('Year')
    plt.title("Trend of development of the field")
    plt.show()
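
if __name__ == "__main__":
    # Minimal end-to-end demo sketch, assuming network access to
    # api.semanticscholar.org. The URL is a hypothetical placeholder;
    # replace it with any supported paper URL.
    demo_url = "https://arxiv.org/abs/1706.03762"
    print(doc_stats(main_abstract(demo_url)))   # top keywords of the main abstract
    print(reco_authors(demo_url, 5))            # five most-recommended authors
    export_reco_csv(demo_url)                   # writes recolist.csv to the current directory
    plot_YearTrend_url(demo_url)                # histogram of publication years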