-
Notifications
You must be signed in to change notification settings - Fork 0
/
xx.py
61 lines (49 loc) · 1.59 KB
/
xx.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# -*- coding: utf-8 -*-
import urllib
import urllib2
import cookielib
import re
class WhySpider(object):
    """Minimal HTTP client that keeps cookies between requests.

    Every request goes through one shared opener wired to a cookie jar and
    carries a fixed, browser-like ``User-Agent`` header.
    """

    def __init__(self):
        """Create the cookie jar, the cookie-aware opener and the headers."""
        self.cookie_jar = cookielib.CookieJar()
        self.opener = urllib2.build_opener(
            urllib2.HTTPCookieProcessor(self.cookie_jar))
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:28.0) Gecko/20100101 Firefox/28.0'}

    def send_get(self, get_url):
        """Send a GET request and return the response body.

        Best-effort: any exception raised while building the request,
        opening the URL or reading the body is printed and swallowed.

        :param get_url: URL to fetch.
        :returns: the response body, or ``""`` if the request failed.
        """
        result = ""
        try:
            my_request = urllib2.Request(url=get_url, headers=self.headers)
            response = self.opener.open(my_request)
            try:
                result = response.read()
            finally:
                # Release the connection even when read() fails.
                response.close()
        except Exception as e:
            # Same text as the former two-argument print statement.
            print("Exception : ========================= %s" % (e,))
        return result

    def request(self, get_url):
        """Open ``get_url`` and discard the response.

        Simplified variant of :meth:`send_get` kept lean for speed: the body
        is not read and errors are not caught, so any exception raised by
        the request propagates to the caller.

        :param get_url: URL to request.
        """
        my_request = urllib2.Request(url=get_url, headers=self.headers)
        # Close the response right away so sockets do not pile up when this
        # method is called in a tight loop.
        self.opener.open(my_request).close()
# Create the spider; the constructor already performs all initialisation,
# so no explicit second __init__() call is needed.
ws = WhySpider()

# Fetch the blog index page and extract the article links with a regex.
url = 'http://blog.csdn.net/neil4'
listurl = re.findall(r'/neil4/article/details/[0-9]{8}', ws.send_get(url))

# De-duplicate the links (keeping first-seen order) and turn each relative
# path into an absolute URL.
seen_paths = set()
l2 = []
for article_path in listurl:
    if article_path not in seen_paths:
        seen_paths.add(article_path)
        l2.append('http://blog.csdn.net' + article_path)

# Request every article URL once per round, for 10000 rounds.
for round_index in range(10000):
    print("================== " + str(round_index) + " =======================")
    # Progress counter: starts at round_index * 100 and grows by one per
    # request.
    progress = round_index * 100
    for article_url in l2:
        ws.request(article_url)
        progress += 1
        # NOTE(review): the original indentation was lost; this progress
        # print is assumed to run once per request -- confirm.
        print(progress)