# -*- coding: utf-8 -*-
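"""Crawler for DBLP author and publication data (dblp_crawler.py).

Resolves author names to DBLP keys, downloads each author's publication
records, and writes the results to CSV files so that co-author (conflict)
lists can be derived from them. All HTTP responses are cached on disk in
data/.cache_queries to avoid re-querying DBLP on repeated runs.
"""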
import urllib.request
import urllib.error  # HTTPError is caught explicitly in request_dblp
import xmltodict
from time import sleep
from tqdm import tqdm
import os.path
from util import read_csv, write_csv, copy_dic
import unidecode
import html
import argparse
import pickle
def sanitize_text(text):
    """Decode a raw DBLP response and strip characters that confuse the XML/CSV handling."""
    # DBLP serves UTF-8; decoding as ASCII would raise on any non-ASCII byte,
    # and unidecode transliterates the result down to plain ASCII anyway.
    text = unidecode.unidecode(html.unescape(text.decode('utf-8', errors='replace')))
    return text.replace("&", " ")
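
# Pickle-backed cache mapping DBLP query strings to the raw response bytes,
# so repeated runs do not hit the DBLP servers again for the same query.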
class Cache(object):
def __init__(self, path='data/.cache_queries'):
self.backup_ctr = 0
self.path = path
self.queries = {}
def backup(self):
if os.path.exists(self.path):
backupname = self.path + ".bak"
if os.path.exists(backupname):
os.remove(backupname)
os.rename(self.path, backupname)
def backup_and_save(self, force=False):
self.backup_ctr += 1
if self.backup_ctr % 100 == 0 or force:
self.backup()
with open(self.path, "wb") as f:
pickle.dump(self, f, -1)
@classmethod
def load(cls, path):
if os.path.exists(path):
try:
with open(path, "rb") as f:
cache = pickle.load(f, errors="strict")
return cache
except Exception as ex:
print("Problem loading paper cache, with exception",
ex.__class__.__name__,
"if this persists, re-create %s" % path)
raise ex
else:
print("Cache not found, creating")
return cls(path)
def __contains__(self, key):
return key in self.queries
def add_query(self, key, response):
if key not in self.queries:
self.queries[key] = response
def get_query(self, key):
if key in self.queries:
return self.queries[key]
else:
return None
cache = Cache.load('data/.cache_queries')
def save_cache():
cache.backup_and_save(True)
def request_dblp(query):
    """Fetch a DBLP endpoint as a parsed dict, using the on-disk cache and
    retrying when DBLP answers with HTTP 429 (rate limiting)."""
    url = ('http://dblp.uni-trier.de/%s' % query)
    raw_str = None  # defined up front so the error handler below can always print it
    num_retries = 2
while num_retries > 0:
try:
if query in cache:
raw_str = cache.get_query(query)
else:
resource = urllib.request.urlopen(url)
raw_str = resource.read()
cache.add_query(query, raw_str)
cache.backup_and_save()
raw_str = sanitize_text(raw_str)
return xmltodict.parse(raw_str)
except urllib.error.HTTPError as err:
if err.code == 429:
print("HTTP error code", err.code, "reason:", err.reason, "will wait:", err.headers['Retry-After'])
wait = int(err.headers['Retry-After'])
sleep(wait + 10)
num_retries -= 1
else:
raise err
        except Exception as err:
            print("Something bad happened:", str(err))
            print(raw_str)
            raise err
    # whoops, all retries were used up
    raise Exception("Something went wrong, we ran out of retries")
def request_author_key(author):
data = request_dblp('search/author?xauthor="%s"' %
author.replace(' ', '+'))
# TODO DOES NOT WORK IF THE PERSON HAS ALIASES
if not data['authors']:
return ['']
elif isinstance(data['authors']['author'], list):
return [a['@urlpt'] for a in data['authors']['author']]
else:
return [data['authors']['author']['@urlpt']]
def make_author_link(key):
return "http://dblp.uni-trier.de/pers/hd/" + key
def request_publication_keys(author_key):
data = request_dblp('rec/pers/%s/xk' %
author_key)
return data['dblpperson']['dblpkey'][1:]
def sanitize_coauthors(authors):
sanitized_authors = []
# Check if we have a bunch of letters as authors:
bad_authors = True
for author in authors:
if len(author) != 1:
bad_authors = False
if bad_authors:
return ["".join(authors)]
for author in authors:
if isinstance(author, str):
sanitized_authors.append(author)
else:
sanitized_authors.append(author['#text'])
return sanitized_authors
def sanitize_titles(title):
if isinstance(title, str):
return title.replace(',', ' ')
else:
return (' '.join(title)).replace(',', ' ')
def read_pub(pub_xml):
pub_type = list(pub_xml['dblp'].keys())[0]
year = int(pub_xml['dblp'][pub_type]['year'])
if 'author' in pub_xml['dblp'][pub_type]:
authors = sanitize_coauthors(pub_xml['dblp'][pub_type]['author'])
else:
authors = []
return {'key': pub_xml['dblp'][pub_type]['@key'],
'title': sanitize_titles(pub_xml['dblp'][pub_type]['title']),
'year': year,
'authors': authors}
#http://dblp.uni-trier.de/rec/rdf/conf/isca/KannanGGS17.rdf
def request_publication(key):
xmldict = request_dblp('rec/bibtex/%s.xml' % key)
rdfdict = request_dblp('rec/rdf/%s.rdf' % key)
return xmldict, rdfdict
def request_publications(author_key):
pubs = []
publication_keys = request_publication_keys(author_key)
for key in tqdm(publication_keys):
pub, _ = request_publication(key)
if pub:
pubs.append(read_pub(pub))
return pubs
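
# A publication is dropped if any blacklist entry appears as a substring of its
# DBLP key (e.g. a venue prefix); filter_publications currently passes an empty
# blacklist, so only the year cutoff is applied.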
def is_blacklisted(blacklist, key):
for b in blacklist:
if b in key:
return True
return False
def filter_publications(publications, year):
blacklist = []
return [pub for pub in publications
if (not is_blacklisted(blacklist, pub['key']) and
pub['year'] >= year)]
def get_author_keys(author_list):
    # read_csv is assumed to return a mapping of id -> author record, matching
    # how build_author_key_csv consumes it below
    authors = read_csv(author_list, ["first_name", "last_name"])
    for _, author in tqdm(authors.items()):
        keys = request_author_key(author["first_name"] + "+" +
                                  author["last_name"])
        author["keys"] = keys
    return authors
def build_author_key_csv(author_key_list, authors):
csv = []
for k, author in authors.items():
row = [k, author['first_name'], author['last_name']]
for key in author['keys']:
csv += [row + [key, 'x', make_author_link(key)]]
write_csv(author_key_list, ['id', 'first_name', 'last_name', 'key',
'valid', 'key_link'], csv)
def build_paper_csv(pub_list, authors, whitelist):
    # NOTE: the whitelist argument is currently unused; 'put_year' (sic) is kept
    # as the column name because get_co_authors reads it back under that name
    schema = ['id', 'first_name', 'last_name', 'keys',
              'valid', 'pub_key', 'pub_title', 'put_year', 'pub_authors']
csv = []
for k, author in authors.items():
row = [k, author['first_name'], author['last_name'],
";".join(author['keys']), 'x']
for pub in author['pubs']:
csv += [row + [pub['key'], pub['title'], pub['year'],
';'.join(pub['authors'])]]
write_csv(pub_list, schema, csv)
def get_paper_list(author_keys, year):
author_keys = read_csv(author_keys, ['first_name', 'last_name',
'key', 'valid', 'key_link'])
authors = {}
# Adding authors with multiple keys
for entry in author_keys:
idx = entry['id']
if idx not in authors and entry['valid']:
authors[idx] = {}
copy_dic(entry, authors[idx], ['first_name', 'last_name'])
authors[idx]['keys'] = [entry['key']]
elif entry['valid']:
authors[idx]['keys'].append(entry['key'])
print("looping over authors")
for idx, v in authors.items():
print(v)
print("processing %d" % idx)
v['pubs'] = []
for k in v['keys']:
v['pubs'].extend(filter_publications(request_publications(k),
year))
return authors
def get_co_authors(paper_csv):
papers = read_csv(paper_csv, ['first_name', 'last_name',
'keys', 'valid', 'pub_key', 'pub_title',
'put_year', 'pub_authors'])
papers_dic = {}
for p in papers:
a_id = int(p['id'])
if a_id not in papers_dic:
papers_dic[a_id] = {}
papers_dic[a_id]['first_name'] = p['first_name']
papers_dic[a_id]['last_name'] = p['last_name']
papers_dic[a_id]['keys'] = set([p['keys']])
papers_dic[a_id]['pubs'] = []
papers_dic[a_id]['co-authors'] = {}
a_dic = papers_dic[a_id]
if p['valid']:
pub = (p['pub_key'], p['pub_title'])
a_dic['pubs'].append(pub)
for co_a in p['pub_authors'].split(";"):
if co_a not in a_dic['co-authors']:
a_dic['co-authors'][co_a] = [pub]
else:
a_dic['co-authors'][co_a].append(pub)
return papers_dic
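
# Pipeline: author-keys resolves names to DBLP keys, paper-lists downloads the
# publications for the validated keys, and list-co-authors aggregates the
# resulting CSV into per-author co-author lists. The get-conflicts mode is
# accepted on the command line but not implemented yet.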
def main():
parser = argparse.ArgumentParser()
parser.add_argument("mode", choices=["author-keys", "paper-lists",
"list-co-authors", "get-conflicts"])
parser.add_argument("--author-list", help="List with author names")
parser.add_argument("--author-keys",
help="List with author keys to be searched")
parser.add_argument("--paper-list",
help="Author paper list")
parser.add_argument("--co-author-list", help="Co author list")
parser.add_argument("--co-author-year", type=int,
default=2012,
help="Last acceptable year for"
"collaboration without conflict")
    # These options are not implemented yet
parser.add_argument("--pc-conflicts",
help="File with conflicts listed by pc member")
parser.add_argument("--pc-conflicts-new-csv",
help="New conflicts found in DBLP. CSV to be"
" fed to hotcrp")
parser.add_argument("--pc-conflicts-new-report",
help="New conflicts found in DBLP. File with report to"
" be sent to PC members")
parser.add_argument("--hot-crp-papers",
help="JSON with hotcrp papers, will be used to"
" generate the conflict list")
args = parser.parse_args()
def check_arg(arg, msg):
if not arg:
parser.print_help()
raise ValueError(msg)
    if args.mode == 'author-keys':
        check_arg(args.author_list, "No author list passed")
        check_arg(args.author_keys, "No author keys output file passed")
        authors = get_author_keys(args.author_list)
        build_author_key_csv(args.author_keys, authors)
elif args.mode == 'paper-lists':
check_arg(args.author_keys, "No author keys passed")
check_arg(args.paper_list, "No paper list passed")
authors = get_paper_list(args.author_keys,
args.co_author_year)
# build_paper_csv(args.paper_list, authors, args.drop_conf_whitelist)
build_paper_csv(args.paper_list, authors, True)
elif args.mode == 'list-co-authors':
check_arg(args.paper_list, "No paper list passed")
papers_dic = get_co_authors(args.paper_list)
for k, v in papers_dic.items():
print(k)
print(v)
if __name__ == '__main__':
    try:
        main()
    finally:
        # persist the query cache even if the run fails part-way through
        save_cache()
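
# Example invocations (file paths are hypothetical; any CSV locations work):
#   python dblp_crawler.py author-keys     --author-list data/authors.csv --author-keys data/author_keys.csv
#   python dblp_crawler.py paper-lists     --author-keys data/author_keys.csv --paper-list data/papers.csv
#   python dblp_crawler.py list-co-authors --paper-list data/papers.csv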