forked from SpiderClub/haipproxy
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawler.py
86 lines (76 loc) · 2.97 KB
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import time
import requests
from haipproxy.client import ProxyFetcher
from haipproxy.utils import get_redis_conn
from .configs import (
SCORE_MAPS, TTL_MAPS,
SPEED_MAPS, LONGEST_RESPONSE_TIME,
LOWEST_SCORE, TTL_VALIDATED_RESOURCE,
LOWEST_TOTAL_PROXIES, DATA_ALL,
TOTAL_SUCCESS_REQUESTS, REDIS_HOST,
REDIS_PORT, REDIS_PASS,
REDIS_DB)
class Crawler:
    """Fetch pages through proxies handed out by a haipproxy ``ProxyFetcher``.

    Every request is reported back to the fetcher as a ``'success'`` or a
    ``'failure'`` for the proxy that served it, and each successful request
    increments the ``success_req`` counter in Redis.
    """

    # Timeout, in seconds, passed to ``requests.get``.
    timeout = 10
    # Redis key incremented once per successful request.
    success_req = TOTAL_SUCCESS_REQUESTS
    # Sent with every request. NOTE(review): 'Host' is pinned to zhihu, so
    # URLs on another host will carry a mismatched Host header.
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
        'Host': 'www.zhihu.com'
    }
    # Connection settings used for ``get_redis_conn`` and also forwarded to
    # the fetcher through ``client_configs['redis_args']``.
    redis_args = {
        'host': REDIS_HOST,
        'port': REDIS_PORT,
        'password': REDIS_PASS,
        'db': REDIS_DB
    }
    # Keyword arguments handed to ``ProxyFetcher``; values come from
    # ``.configs``.
    client_configs = {
        'strategy': 'greedy',
        'fast_response': 5,
        'score_map': SCORE_MAPS,
        'ttl_map': TTL_MAPS,
        'speed_map': SPEED_MAPS,
        'longest_response_time': LONGEST_RESPONSE_TIME,
        'lowest_score': LOWEST_SCORE,
        'ttl_validated_resource': TTL_VALIDATED_RESOURCE,
        'min_pool_size': LOWEST_TOTAL_PROXIES,
        'all_data': DATA_ALL,
        'redis_args': redis_args
    }

    def __init__(self, retries=5):
        """Create the proxy fetcher and the Redis connection.

        Args:
            retries: Maximum number of failed attempts ``get`` makes before
                giving up and returning ``None``.
        """
        self.retries = retries
        self.fetcher = ProxyFetcher('zhihu', **self.client_configs)
        self.conn = get_redis_conn(**self.redis_args)
        # Key used in the ``proxies`` mapping given to ``requests.get``.
        self.scheme = 'https'

    def get(self, url):
        """Download ``url`` through a proxy, retrying with fresh proxies.

        Each attempt takes a proxy from the fetcher, polling every 0.5 s
        (without limit, and without consuming a retry) until one is
        available. An attempt fails when the request raises or when the
        response body contains the anti-bot marker '安全验证'
        ("security verification"); the proxy then receives ``'failure'``
        feedback.

        Args:
            url: Address to request.

        Returns:
            The response body text of the first non-blocked response, or
            ``None`` once ``self.retries`` attempts have failed.
        """
        attempts = 0
        while attempts < self.retries:
            # Wait until the fetcher actually has a proxy to offer.
            address = self.fetcher.get_proxy()
            while not address:
                time.sleep(0.5)
                address = self.fetcher.get_proxy()
            proxy = {self.scheme: address}

            # NOTE(review): the try body also covers the feedback and counter
            # calls below, so an error raised by that bookkeeping is reported
            # as a proxy failure as well. Kept as-is to preserve the
            # best-effort retry behaviour.
            try:
                # perf_counter is monotonic, so the measured latency cannot be
                # skewed (or go negative) when the system clock is adjusted.
                started = time.perf_counter()
                # verify=False: TLS certificate verification is disabled for
                # proxied requests.
                resp = requests.get(url, headers=self.headers, proxies=proxy,
                                    timeout=self.timeout, verify=False)
                elapsed_ms = int((time.perf_counter() - started) * 1000)

                # Decode the body once; it is both inspected and returned.
                body = resp.text
                if '安全验证' in body:
                    self.fetcher.proxy_feedback('failure', address)
                    print('Current ip is blocked! The proxy is {}'.format(proxy))
                    attempts += 1
                    continue

                print('Request succeeded! The proxy is {}'.format(proxy))
                # if you use greedy strategy, you must feedback
                self.fetcher.proxy_feedback('success', address, elapsed_ms)
                # not considering transaction
                self.conn.incr(self.success_req, 1)
                return body
            except Exception as e:
                print(e)
                print('Request failed!The proxy is {}'.format(proxy))
                # it's important to feedback, otherwise you may use the bad
                # proxy next time
                self.fetcher.proxy_feedback('failure', address)
                attempts += 1
        return None