-
Notifications
You must be signed in to change notification settings - Fork 2
/
generator.py
51 lines (38 loc) · 1.58 KB
/
generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#!/usr/bin/env python
import re, os, sqlite3
from bs4 import BeautifulSoup
from urllib import parse
# Open the docset's SQLite search-index database.
db = sqlite3.connect('./Racket.docset/Contents/Resources/docSet.dsidx')
cur = db.cursor()

# Rebuild the index from scratch on every run.  `IF EXISTS` makes the drop a
# no-op when the table is not there yet, without hiding genuine database
# errors the way a blanket `except: pass` would.
cur.execute('DROP TABLE IF EXISTS searchIndex;')
cur.execute('CREATE TABLE searchIndex(id INTEGER PRIMARY KEY, name TEXT, type TEXT, path TEXT);')
# The unique index over (name, type, path) is what allows the later
# `INSERT OR IGNORE` statements to silently skip duplicate entries.
cur.execute('CREATE UNIQUE INDEX anchor ON searchIndex (name, type, path);')

# Directory that is walked for HTML pages; this prefix is also stripped from
# file names when building the paths stored in the index.
docpath = './Racket.docset/Contents/Resources/Documents/'
def fetch(f):
    """Index every definition found in one HTML page of the docset.

    Each ``.SVInsetFlow`` group on the page supplies the entry type (the text
    of its ``RBackgroundLabelInner`` div).  Every ``.RktValDef`` element inside
    the group supplies an entry name and, through its ``href``, the entry
    path.  Each entry is printed and inserted into ``searchIndex``.

    Relies on module-level state: ``root`` (the directory currently visited by
    the ``os.walk`` loop), ``docpath`` and the database cursor ``cur``.

    Args:
        f: File name of the page, relative to ``root``.
    """
    filename = os.path.join(root, f)
    # Close the file promptly instead of leaking one handle per page.
    with open(filename) as page_file:
        page = page_file.read()
    # No parser is named here, so bs4 chooses among the parsers it finds installed.
    soup = BeautifulSoup(page)
    # Path of this page relative to the documents directory.
    page_path = filename.replace(docpath, '')

    for group in soup.select('.SVInsetFlow'):
        type_ele = group.find('div', {'class': 'RBackgroundLabelInner'})
        if type_ele is None:
            # Skip only this unlabeled group; later groups on the same page
            # may still carry a label and must not be dropped.
            continue
        _type = type_ele.text

        for tag in group.select('.RktValDef'):
            _name = tag.text
            # A matched element is not guaranteed to carry an href attribute.
            href = tag.get('href') or ''
            # Start from None for every tag so an unrecognised href can never
            # inherit the path computed for the previous tag.
            _path = None
            if href.startswith('#'):
                # Same-page anchor: page path plus fragment.
                _path = page_path + href
            elif 'local-redirect' in href:
                # Redirect link: the target is encoded in the doc/rel query values.
                matches = re.search(r'doc=(.*)&rel=(.*)', href)
                if matches:
                    _path = matches.group(1) + '/' + parse.unquote(matches.group(2))
            if _path:
                print('type: %s, name: %s, path: %s' % (_type, _name, _path))
                cur.execute('INSERT OR IGNORE INTO searchIndex(name, type, path) VALUES (?,?,?)', (_name, _type.title(), _path))
# Visit every HTML page under the documents directory and index it.
# NOTE: the loop variable must stay named `root` -- fetch() reads it as a
# module-level global to rebuild the full file path.
for root, _subdirs, page_names in os.walk(docpath):
    # Pages in any directory whose path mentions "demo" are not indexed.
    if 'demo' in root:
        continue
    for page_name in page_names:
        if not page_name.endswith('.html'):
            continue
        fetch(page_name)

# Persist all inserted rows, then release the database handle.
db.commit()
db.close()