-
Notifications
You must be signed in to change notification settings - Fork 5
/
crawl.py
89 lines (67 loc) · 2.45 KB
/
crawl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import requests
import re
import json
from multiprocessing import Pool
# Listing page for all NIPS 2022 poster sessions; each entry links to an event page.
url = 'https://nips.cc/Conferences/2022/Schedule?type=Poster'
def get_author_info(author):
    """Fetch one speaker's name and affiliation from the NIPS 2022 schedule.

    Parameters
    ----------
    author : str
        Speaker id as it appears in showSpeaker('...') links, formatted
        like '<no>-<event>'; only the part before the first '-' is kept
        as the speaker number.

    Returns
    -------
    dict with keys 'name', 'affiliation', 'no' (all strings; 'name' and
    'affiliation' are '' when the page lacks the expected <h3>/<h4> tags).
    """
    author_url = f'https://nips.cc/Conferences/2022/Schedule?showSpeaker={author}'
    # Retry only on network failures. The original bare `except:` also
    # swallowed KeyboardInterrupt/SystemExit, making the loop uninterruptible;
    # the timeout keeps a stalled connection from hanging a worker forever.
    while True:
        try:
            r = requests.get(author_url, timeout=30)
            break
        except requests.exceptions.RequestException:
            print('retrying')
            continue
    # Escape non-ASCII so the regexes below see the same escaped text the
    # rest of this script works with.
    text = r.text.encode('unicode_escape').decode('utf-8')
    author_info = {}
    name_match = re.search(r'<h3>.*?</h3>', text, re.DOTALL)
    affiliation_match = re.search(r'<h4>.*?</h4>', text, re.DOTALL)
    # Guard against a missing tag: .group(0) on None previously raised
    # AttributeError and killed the whole crawl for one malformed page.
    author_info['name'] = (
        name_match.group(0).split('>')[1].split('<')[0] if name_match else ''
    )
    author_info['affiliation'] = (
        affiliation_match.group(0).split('>')[1].split('<')[0] if affiliation_match else ''
    )
    author_info['no'] = author.split('-')[0]
    return author_info
def paper_retriever(paper_index):
    """Fetch the title and author list for one poster event.

    Parameters
    ----------
    paper_index : str
        Numeric event id taken from the schedule page's maincard anchors.

    Returns
    -------
    dict with keys 'title' (str, '' when the page lacks the expected
    maincardBody div) and 'authors' (list of dicts from get_author_info).
    """
    paper = {}
    paper_url = f'https://nips.cc/Conferences/2022/Schedule?showEvent={paper_index}'
    # Retry only on network failures; the bare `except:` in the original
    # also trapped KeyboardInterrupt, and the missing timeout could hang
    # a pool worker indefinitely.
    while True:
        try:
            r = requests.get(paper_url, timeout=30)
            break
        except requests.exceptions.RequestException:
            print('retrying')
            continue
    text = r.text.encode('unicode_escape').decode('utf-8')
    # Guard against a page without the title div instead of crashing on
    # None.group(0).
    title_match = re.search(r'<div class=\"maincardBody\">.*?</div>', text, re.DOTALL)
    paper['title'] = (
        title_match.group(0).split('>')[1].split('<')[0] if title_match else ''
    )
    author_info_list = re.findall(r'showSpeaker\(\'.*?\'\)', text, re.DOTALL)
    paper_authors = []
    for author in author_info_list:
        # showSpeaker('<id>') -> the quoted id is the second '-split token.
        author_no = author.split('\'')[1]
        paper_authors.append(get_author_info(author_no))
    paper['authors'] = paper_authors
    print(paper['title'])
    return paper
def main():
    """Crawl the NIPS 2022 poster schedule and write one JSON object per line.

    Side effects: writes the raw (escaped) schedule HTML to nips2022.txt and
    the collected paper metadata to nips2022.json.
    """
    # Timeout so the initial fetch cannot hang the whole crawl.
    r = requests.get(url, timeout=30)
    text = r.text.encode('unicode_escape').decode('utf-8')
    # The escaped text is ASCII-safe, but pin the encoding explicitly so the
    # round-trip does not depend on the platform default.
    with open('nips2022.txt', 'w', encoding='utf-8') as f:
        f.write(text)
    with open('nips2022.txt', 'r', encoding='utf-8') as f:
        text = f.read()
    # find all the poster sessions
    paper_list = re.findall(r'id=\"maincard_.*?\"', text, re.DOTALL)
    # id="maincard_<index>" -> keep just the numeric index.
    paper_index_list = [p.split('_')[1].split('"')[0] for p in paper_list]
    # use multiprocessing to boost; the with-block closes and joins the
    # workers (the original leaked the pool by never calling close/join).
    with Pool(20) as pool:
        # paper info retrieving
        pending = [
            pool.apply_async(paper_retriever, args=(paper_index,))
            for paper_index in paper_index_list
        ]
        # write to json file; .get() inside the with-block so the pool is
        # still alive while results are collected.
        with open('nips2022.json', 'w', encoding='utf-8') as f:
            for result in pending:
                f.write(json.dumps(result.get()) + '\n')
    print('done')
# Entry guard is required here: multiprocessing re-imports this module in
# worker processes, and without the guard each worker would re-run main().
if __name__ == '__main__':
    main()