forked from lryong/tutorials-from-runoob
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDownloadTutorial.py
64 lines (60 loc) · 1.48 KB
/
DownloadTutorial.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/usr/bin/python
#encoding:utf-8
# import os
import re
import urllib2
from lxml import etree
import StringIO
import gzip
import sys
# Python 2 only: sys.setdefaultencoding is removed from the sys module at
# interpreter start-up, so sys is reloaded to get it back. The process-wide
# default encoding for implicit str <-> unicode conversion is then set to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')
def getcontent(url):
    """Fetch *url* and return a file-like object over the page body.

    Handles gzip-compressed pages: when the response declares
    ``Content-Encoding: gzip`` the body is buffered in memory and wrapped in
    a ``gzip.GzipFile`` so callers read decompressed data.  Otherwise the
    urllib2 response object itself is returned.

    Args:
        url: Address of the page to download.

    Returns:
        An object supporting ``read()``, ``readlines()`` and ``close()``.
    """
    mainpage = urllib2.urlopen(url)
    content_encoding = mainpage.info().get('Content-Encoding')
    if content_encoding != 'gzip':
        return mainpage
    # Reuse the response already in hand instead of requesting the URL a
    # second time, and release the connection once the body is buffered.
    try:
        compressed = StringIO.StringIO(mainpage.read())
    finally:
        mainpage.close()
    return gzip.GzipFile(fileobj=compressed)
def gettutorial(title, url):
    """Download all chapters of one tutorial and merge them into ``<title>.html``.

    The page at *url* is parsed for chapter links.  Each chapter is fetched
    from ``http://www.runoob.com`` and its lines are written to the output
    file: the first chapter is written from its first line, later chapters
    skip their leading lines, and the last line of every chapter is dropped.
    A single closing ``</html>`` tag is written at the end.

    The output file is opened in append mode, so an existing ``<title>.html``
    is extended rather than overwritten.

    Args:
        title: Base name (without extension) of the output file.
        url: Address of the tutorial's index page.
    """
    # Number of leading lines dropped from every chapter after the first.
    # NOTE(review): presumably the repeated HTML head boilerplate - confirm
    # against the current page layout.
    skipped_head_lines = 24

    print('[**]' + url)
    index_page = getcontent(url)
    try:
        index = etree.HTML(index_page.read())
    finally:
        index_page.close()
    links = index.xpath('/html/body/div[3]/div/div[1]/div[2]/div//a/@href')

    section = ''       # directory taken from the latest '/<section>/<page>' link
    first_chapter = True
    # 'with' guarantees the file is closed even if a download fails part-way.
    with open(title + '.html', 'a') as out:
        for link in links:
            parts = link.split('/')
            if len(parts) == 1:
                # Bare file name: resolve it against the current section.
                link = '/' + section + '/' + link
            elif len(parts) == 3 and not parts[0] and parts[1]:
                # Site-absolute '/<section>/<page>' link: remember the section.
                section = parts[1]
            else:
                # Anything else (deeper paths, 'http://host', '//host', ...)
                # cannot be joined onto the site root below, so skip it.
                continue
            page_url = 'http://www.runoob.com' + link
            print('[**]\t' + page_url)
            page = getcontent(page_url)
            try:
                lines = page.readlines()
            finally:
                page.close()
            start = 0 if first_chapter else skipped_head_lines
            # The final line of each chapter is dropped; the closing tag is
            # written once after the loop.
            out.writelines(lines[start:-1])
            first_chapter = False
        out.write('</html>')
# Script body: read the runoob.com home page, then download every tutorial
# linked from its theme list, one output file per tutorial.
data = getcontent('http://www.runoob.com')
html = etree.HTML(data.read())
theme = html.xpath('/html/body/div[4]/div/div[2]//a')
for a in theme:
    headings = a.xpath('./h4/text()')
    hrefs = a.xpath('./@href')
    if not headings or not hrefs:
        # Anchor without an <h4> caption or an href: nothing to download.
        continue
    # '/' cannot appear in a file name, so it is replaced in the title.
    title = headings[0].strip().replace('/', '-')
    suburl = str(hrefs[0].strip())
    gettutorial(title, suburl)