forked from priyankamandikal/arowf
-
Notifications
You must be signed in to change notification settings - Fork 0
/
backlog_script.py
46 lines (40 loc) · 1.46 KB
/
backlog_script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import io
import traceback
from os import path, listdir
from xml.sax.saxutils import escape

import requests
from bs4 import BeautifulSoup
# Wikipedia maintenance category page whose listed articles become questions.
url = (
    'https://en.wikipedia.org/wiki/'
    'Category:Wikipedia_articles_in_need_of_updating_from_May_2016'
)
# Final component of the URL, used as the category name in question text.
categ = path.split(url)[1]
# Directory prefix (with trailing separator) under which records are stored.
recdir = path.join('records', '')
def nextrecord():
    """Return the next free record number as a 9-digit, zero-padded string.

    The number is one more than the highest 9-digit numeric prefix found
    among the file names in ``recdir``.  Names that do not start with nine
    ASCII digits are ignored, so a stray file cannot reset the numbering.

    Returns:
        ``'000000001'`` when ``recdir`` is missing, unreadable, or holds no
        properly named record; otherwise the highest number plus one,
        formatted with ``format(n, '09')``.
    """
    try:
        names = listdir(recdir)
    except OSError:
        # No usable records directory yet: numbering starts at 1.
        return format(1, '09')

    # Compare numerically rather than lexicographically, and only consider
    # names whose first nine characters are all ASCII digits.
    numbers = [
        int(name[:9])
        for name in names
        if len(name) >= 9 and all(ch in '0123456789' for ch in name[:9])
    ]
    highest = max(numbers) if numbers else 0
    return format(highest + 1, '09')
# HTML written to each question file.  Kept as unicode so that non-ASCII
# article titles survive; it is encoded once, when the file is written.
_QUESTION_TEMPLATE = (
    u'The article <a href="{link}">{title}</a> is in {categ}.'
    u'</br>How would you update it?<br/>'
    u'<a style="float:right;" href="{link}">{link}</a>'
    u'<iframe src="{link}" style="height: 40%; width: 100%;">'
    u'[Can not display <a href="{link}">{link}</a> inline as an iframe here.]'
    u'</iframe>'
)


def _question_html(link, title):
    """Return the question markup for one article, with values HTML-escaped."""
    return _QUESTION_TEMPLATE.format(
        # The link is used both as an attribute value and as text, so the
        # double quote is escaped in addition to &, < and >.
        link=escape(link, {'"': '&quot;'}),
        title=escape(title),
        categ=categ,
    )


def _write_questions(limit=5):
    """Fetch the category page and write one question file per listed article.

    Args:
        limit: stop after this many question files have been written.

    Returns:
        The number of question files written.

    Raises:
        requests.RequestException: if the page cannot be fetched or the
            server answers with an error status.
        IOError / OSError: if a question file cannot be written.
    """
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(response.text, 'html.parser')

    written = 0
    for group in soup.find_all('div', 'mw-category-group'):
        for entry in group.find_all('li'):
            anchor = entry.find('a')
            if anchor is None:
                continue
            href = anchor.get('href')
            title = anchor.get('title')
            if not href or not title:
                # Skip list items that do not link to an article instead of
                # aborting the whole run.
                continue
            link = 'https://en.wikipedia.org' + href
            fn = recdir + nextrecord() + 'q'
            print(fn)
            if path.exists(fn):
                print('A billion questions reached! Answer!')
                return written
            # io.open gives an explicit UTF-8 text file on Python 2 and 3,
            # and the with-block closes it even if the write fails.
            with io.open(fn, 'w', encoding='utf-8') as f:
                f.write(_question_html(link, title))
            written += 1
            if written == limit:
                return written
    return written


if __name__ == '__main__':
    try:
        _write_questions()
    except Exception:
        # Exception (not a bare except) so that normal termination and
        # Ctrl-C are not reported as parsing errors.
        print("Error while parsing backlogs")
        print(traceback.format_exc())