-
Notifications
You must be signed in to change notification settings - Fork 157
/
main.py
executable file
·95 lines (83 loc) · 3.01 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
#=============================================================================
# FileName: main.py
# Desc:
# Author: lizherui
# Email: [email protected]
# HomePage: https://github.com/lizherui
# Version: 0.0.1
# LastChange: 2013-04-21 17:12:36
# History:
#=============================================================================
'''
import re
import requests
import redis
import logging
from BeautifulSoup import BeautifulSoup
# --- Logging -----------------------------------------------------------------
# Path of the log file.
LOG_ADDRESS = '/usr/local/var/log/spider_logging.txt'
# Layout of each log record.
LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
# Logging level.
LOG_LEVEL = logging.DEBUG

# --- Redis -------------------------------------------------------------------
# IP address of the Redis server.
REDIS_IP = '127.0.0.1'
# Port of the Redis server.
REDIS_PORT = 6379
# How often Redis is cleared: main() flushes once the 'times' counter
# reaches this value.
REDIS_FREQUENCE = 10

# --- Filtering ---------------------------------------------------------------
# Keywords used to filter post titles.
SPIDER_KEYS = (u'校招', u'应届', u'毕业生', 'Google')
def init_log():
    """Attach a file handler to the root logger and return that logger.

    The handler writes to LOG_ADDRESS using LOG_FORMAT, and the root logger's
    level is set to LOG_LEVEL.

    Returns:
        The root logger.
    """
    file_handler = logging.FileHandler(LOG_ADDRESS)
    file_handler.setFormatter(logging.Formatter(LOG_FORMAT))

    root = logging.getLogger()
    root.addHandler(file_handler)
    root.setLevel(LOG_LEVEL)
    return root
def init_params():
    """Return the crawl settings for every board that is scraped.

    Returns:
        A tuple with one dict per site. Each dict has the keys:

        'host'    -- prefix that spider() prepends to the links it finds
        'url'     -- page that spider() fetches
        'headers' -- HTTP request headers sent with that fetch
        'href'    -- regular-expression source that spider() compiles and
                     matches against anchor hrefs
    """
    # The patterns are raw strings so that ``\d`` reaches the regex engine
    # untouched instead of being an invalid string escape sequence.
    return (
        {
            'host': 'http://bbs.byr.cn',
            'url': 'http://bbs.byr.cn/board/JobInfo',
            'headers': {
                "X-Requested-With": "XMLHttpRequest",
            },
            'href': r"^/article/JobInfo/\d+$",
        },
        {
            'host': 'http://www.newsmth.net',
            'url': 'http://www.newsmth.net/nForum/board/Career_Campus',
            'headers': {
                "X-Requested-With": "XMLHttpRequest",
            },
            'href': r"^/nForum/article/Career_Campus/\d+$",
        },
    )
def spider(rs, host, url, headers, href):
    """Fetch one board page and add the matching post links to Redis.

    Args:
        rs: Redis client; matching links are added to its 'urls' set.
        host: Prefix prepended to each link's href before it is stored.
        url: Address of the page to fetch.
        headers: HTTP request headers sent with the fetch.
        href: Regular-expression source that an anchor's href must match.
    """
    # NOTE(review): no timeout is passed, so a stalled server blocks the run.
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text)
    # NOTE(review): per the BeautifulSoup docs a None value matches anchors
    # that lack the attribute -- confirm for the installed version.
    anchor_attrs = {
        'href': re.compile(href),
        'title': None,
        'target': None,
    }
    for link in soup.findAll('a', anchor_attrs):
        # Skip pinned ("top") posts.
        if link.parent.parent.get('class') == 'top':
            continue
        title = link.string
        # .string may be None (e.g. the anchor wraps other tags); there is
        # no title text to test, and ``key in None`` would raise TypeError.
        if title is None:
            continue
        # any() gives a real boolean; a bare filter() object is always truthy
        # on Python 3, which would let every link through.
        if any(key in title for key in SPIDER_KEYS):
            link['href'] = host + link['href']
            # NOTE(review): the tag object itself is handed to the client;
            # how it is serialised depends on the redis-py version.
            rs.sadd('urls', link)
def main():
    """Run one crawl pass over every configured board.

    Sets up logging, connects to Redis at REDIS_IP:REDIS_PORT, increments the
    'times' run counter (flushing Redis once it reaches REDIS_FREQUENCE) and
    then calls spider() for each entry returned by init_params().
    """
    logger = init_log()
    logger.info('spider start!')

    rs = redis.Redis(host=REDIS_IP, port=REDIS_PORT)
    # INCR already returns the updated counter, so use it directly instead of
    # issuing a second GET (one round-trip less, and no window in which
    # another client could change the key between the two commands).
    run_count = rs.incr('times')
    if run_count >= REDIS_FREQUENCE:
        # NOTE(review): flushall() empties the whole Redis server, not only
        # the 'urls' and 'times' keys -- confirm nothing else shares it.
        rs.flushall()

    for param in init_params():
        spider(rs, param['host'], param['url'], param['headers'], param['href'])
    logger.info("spider finish!\n")
# Run the crawler only when this file is executed as a script.
if __name__ == "__main__":
    main()