# spider3.py -- forked from forgeries/qichacha_spider
# -*- coding: UTF-8 -*-
import inspect
import json
import os
import random
import time
import csv
import urllib
from importlib import reload
from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver import ActionChains
from openpyxl import load_workbook
import sys
# reload(sys)
# exec("sys.setdefaultencoding('utf-8')")
from config import chrome_driver, phantomjs_driver, log_dir
from headers import random_user_agent
from tools.chaojiying import Chaojiying_Client
# Build the Chrome start-up options and create the module-level `driver`
# that the functions below use to control the browser.
chrome_options = webdriver.ChromeOptions()
# chrome_options.add_argument('--user-agent='+random_user_agent())
# chrome_options.add_argument('--headless') # enable headless (no UI) mode
chrome_options.add_argument('--disable-gpu') # disable the GPU; works around some otherwise unexplained problems
# chrome_options.add_argument('blink-settings=imagesEnabled=false')
# chrome_options.add_argument('--no-sandbox')
# Path of the current file
current_path = inspect.getfile(inspect.currentframe())
# Directory that contains the current file, i.e. the file's parent directory
dir_name = os.path.dirname(current_path)
# Convert it to an absolute path
file_abs_path = os.path.abspath(dir_name)
# The driver binary path is built by plain string concatenation, so
# `chrome_driver` (from config) presumably starts with a path separator -- TODO confirm.
driver = webdriver.Chrome(chrome_options=chrome_options, executable_path=file_abs_path+chrome_driver)
# driver = webdriver.Chrome(executable_path=chrome_driver)
# driver = webdriver.PhantomJS(executable_path=file_abs_path+phantomjs_driver, service_log_path=file_abs_path + log_dir+r'\ghostdriver.log')
driver.maximize_window()
def save():
    """Download the captcha image shown on the current page to ``a.jpg``.

    The image URL is read from the ``src`` attribute of the ``<img>`` inside
    the ``imgCaptcha_img`` container, and the raw bytes are written to
    ``a.jpg`` in the current working directory.
    """
    # Imported here because the file-level ``import urllib`` does not expose
    # ``urlopen`` on Python 3 (it lives in ``urllib.request``).
    from urllib.request import urlopen

    img = driver.find_element_by_xpath('//div[@class="imgCaptcha_img"]/img')
    img_url = img.get_attribute("src")
    with urlopen(img_url) as response:
        data = response.read()
    # Context manager so the file is flushed and closed before it is re-read.
    with open('a.jpg', 'wb') as f:
        f.write(data)
def chaoji():
    """Send ``a.jpg`` to the Chaojiying captcha service and return the recognised text.

    Returns:
        The ``'pic_str'`` entry of the response returned by ``PostPic``.
    """
    # Placeholder credentials: account, password and software ID
    # (the software ID is generated in the service's user centre).
    chaojiying = Chaojiying_Client('账号', '密码', '软件ID')
    # Local image file; read through a context manager so the handle is closed.
    with open('a.jpg', 'rb') as f:
        im = f.read()
    # 1902 is the captcha type code (see the service's price list).
    datas = chaojiying.PostPic(im, 1902)
    ocr = datas['pic_str']
    print(ocr)
    return ocr
# Provides a track that simulates accelerated motion
def get_track(distance):
    """Return a list of integer pixel steps for dragging a slider *distance* pixels.

    The motion is simulated in time slices of 0.2: the acceleration is 3 while
    the travelled distance is below 3/5 of *distance* and 6 afterwards.  Each
    slice's displacement is rounded to an int and appended to the track; the
    loop stops once the (unrounded) travelled distance reaches *distance*.

    Because steps are rounded individually and the final step may overshoot,
    ``sum(track)`` is only approximately equal to *distance*.

    Args:
        distance: Target distance in pixels. Values <= 0 give an empty track.

    Returns:
        list[int]: Per-step x offsets, in order.
    """
    track = []
    current = 0
    mid = distance * 3 / 5
    t = 0.2
    v = 0
    while current < distance:
        a = 3 if current < mid else 6
        v0 = v
        v = v0 + a * t
        # s = v0*t + a*t^2/2; 0.5 is written out so the result does not depend
        # on how the interpreter divides the integers 1 / 2.
        move = v0 * t + 0.5 * a * t * t
        current += move
        track.append(round(move))
    print(track)
    return track
# Slider captcha handling
def slide_discern():
    """Drag the slider captcha handle (element id ``nc_1_n1z``) to the right.

    The handle is pressed, moved step by step along the track produced by
    ``get_track(348)`` and finally released.  If a step raises
    ``StaleElementReferenceException`` the mouse is released, the link inside
    the ``errloading`` element is clicked, the handle is looked up again and
    the failed step is repeated on a new action chain.
    """
    print("滑块验证码验证中。。。")
    handle_xpath = '//*[@id="nc_1_n1z"]'
    # Locate the button that has to be dragged.
    source = driver.find_element_by_xpath(handle_xpath)
    action = ActionChains(driver)
    # Press and hold the left mouse button.
    action.click_and_hold(source).perform()
    # Start sliding; the track imitates the speed of a human drag.
    distance = 348
    track = get_track(distance)
    for step in track:
        try:
            # NOTE(review): the same chain is reused without reset_actions(),
            # so each perform() presumably replays the actions queued so far.
            # The stale-element handler below may rely on that replay touching
            # `source` -- confirm with the Selenium version in use before
            # switching to a fresh chain per step.
            action.move_by_offset(xoffset=step, yoffset=0).perform()
        except StaleElementReferenceException:
            # Release the mouse, click the link inside the error element and
            # start over with a newly located handle.
            action.release().perform()
            driver.find_element_by_xpath('//div[@class="errloading"]/span/a').click()
            source = driver.find_element_by_xpath(handle_xpath)
            action = ActionChains(driver)
            action.click_and_hold(source).perform()
            action.move_by_offset(xoffset=step, yoffset=0).perform()
    # Release the mouse.
    action.release().perform()
def login_web(username='username', password='password'):
    """Log in to qichacha.com with a password and return the session cookies.

    Opens the login page, switches to the password-login tab, types the
    credentials, drags the slider captcha 308 pixels to the right and submits
    the form.  The browser cookies are then printed and joined into a single
    ``name=value; name=value`` string.

    If an image captcha has to be solved as well, ``save()`` and ``chaoji()``
    in this file can download and recognise it; that path is not wired in here.

    Args:
        username: Account name typed into the ``nameNormal`` input.
        password: Password typed into the ``pwdNormal`` input.

    Returns:
        str: The cookies formatted as the value of a ``Cookie`` header.
    """
    # Open the login page.
    driver.get("https://www.qichacha.com/user_login")
    # Switch to password login.
    driver.find_element_by_xpath('//div[@class="login-panel-head clearfix"]/div[2]').click()
    time.sleep(1)
    # Account input box.
    driver.find_element_by_xpath('//div[@class="form-group"]/input[@id="nameNormal"]').send_keys(username)
    time.sleep(1)
    # Password input box.
    driver.find_element_by_xpath('//div[@class="form-group m-t-md"]/input[@id="pwdNormal"]').send_keys(password)
    time.sleep(1)
    # Locate the slider and drag it.  drag_and_drop_by_offset already presses,
    # moves and releases, so one chain with one perform() is enough; queuing an
    # extra click_and_hold/release around it and performing twice would run
    # the drag more than once.
    start = driver.find_element_by_xpath('//div[@id="nc_1_n1t"]/span')
    ActionChains(driver).drag_and_drop_by_offset(start, 308, 0).perform()
    time.sleep(1)
    # Click the login button.
    driver.find_element_by_xpath('//form[@id="user_login_normal"]/button').click()
    cookie_list = driver.get_cookies()
    print(cookie_list)
    cookie = "; ".join(
        '{}={}'.format(entry['name'], entry['value']) for entry in cookie_list
    )
    print('cookie=============={}'.format(cookie))
    return cookie
# XPath matching every row (header row included) of the table in the open risk modal.
_MODAL_ROWS_XPATH = '//div[@class="modal-body"]/section/div[2]/table/tbody/tr'
# XPath of the close button of the open risk modal.
_MODAL_CLOSE_XPATH = '//div[@class="modal fade in"]/div/div[@class="modal-content risk-modal-list"]/div/button'

# (label, XPath template) per scraped column; "{}" receives the 1-based row index.
# Court judgements: case name, publication date, case number, role in the case, court.
_JUDGEMENT_FIELDS = (
    ('案件名称', _MODAL_ROWS_XPATH + '[{}]/td[2]/a/text()'),
    ('发布时间', _MODAL_ROWS_XPATH + '[{}]/td[3]/text()'),
    ('案件编号', _MODAL_ROWS_XPATH + '[{}]/td[4]/text()'),
    ('案件身份', 'string(//table[@class="ntable ntable-odd"]/tbody/tr[{}]/td[5])'),
    ('执行法院', _MODAL_ROWS_XPATH + '[{}]/td[6]/text()'),
)
# Hearing notices: case number, hearing date, cause of action, plaintiff side, defendant side.
_NOTICE_FIELDS = (
    ('案号', _MODAL_ROWS_XPATH + '[{}]/td[2]/a/text()'),
    ('开庭日期', _MODAL_ROWS_XPATH + '[{}]/td[3]/text()'),
    ('案由', _MODAL_ROWS_XPATH + '[{}]/td[4]/text()'),
    ('公诉人/原告/上诉人/申请人', _MODAL_ROWS_XPATH + '[{}]/td[5]/text()'),
    ('被告人/被告/被上诉人/被申请人', _MODAL_ROWS_XPATH + '[{}]/td[6]/text()'),
)


def _read_row(html, row, fields):
    """Return the stripped cell texts of table row *row*, one per entry of *fields*.

    Raises:
        IndexError: If a node-set XPath of *fields* matches nothing for this row.
    """
    values = []
    for _label, template in fields:
        found = html.xpath(template.format(row))
        # ``string(...)`` expressions yield a str; node-set expressions yield a list.
        if not isinstance(found, str):
            found = found[0]
        values.append(found.strip())
    return values


def _collect_modal_records(panels, section, fields, title_index, company, credit_code):
    """Open each panel's modal, scrape its table rows and return one dict per row.

    Args:
        panels: Clickable elements; clicking one opens a modal with a table.
        section: Section name used in the progress message.
        fields: ``(label, xpath_template)`` pairs describing the columns.
        title_index: Index into *fields* of the value stored under the title key.
        company: Company name stored in every record.
        credit_code: Unified social credit code stored in every record.
    """
    records = []
    for position, panel in enumerate(panels, start=1):
        print(position)
        time.sleep(1)
        panel.click()
        # A delay is required here while the page's JavaScript renders the modal.
        time.sleep(1)
        html = etree.HTML(driver.page_source)
        # Number of rows in the modal, header row included.
        row_total = len(driver.find_elements_by_xpath(_MODAL_ROWS_XPATH))
        print('{}含有 {}条数据'.format(section, row_total - 1))
        # Row 1 is the header, data rows are 2..row_total.
        for row in range(2, row_total + 1):
            try:
                values = _read_row(html, row, fields)
            except IndexError:
                # NOTE(review): falling back to row 2 is kept from the original
                # best-effort handling; it records row 2's data for a row that
                # could not be parsed -- confirm this is really wanted.
                try:
                    values = _read_row(html, 2, fields)
                except IndexError:
                    print('第 {} 行解析失败,已跳过'.format(row))
                    continue
            for (label, _template), value in zip(fields, values):
                print(label + ':', value)
            # A fresh dict per row: appending one shared dict would make every
            # list entry alias the last row scraped.
            records.append({
                '企业名称:': company,
                '统一社会信用代码:': credit_code,
                '类型:': '法律风险',
                '标题:': values[title_index],
                '时间:': values[1],
                '内容:': '\n'.join(
                    label + ':' + value
                    for (label, _template), value in zip(fields, values)
                ),
            })
        # Close the modal.
        driver.find_element_by_xpath(_MODAL_CLOSE_XPATH).click()
    return records


def _search_company(name):
    """Type *name* into the search box and submit the search."""
    from selenium.common.exceptions import WebDriverException

    key_xpath = '//div[@class="input-group"]/input[@name="key"]'
    try:
        # Enter the company name and click search.
        driver.find_element_by_xpath(key_xpath).send_keys(name)
        time.sleep(1)
        driver.find_element_by_xpath('//div[@class="input-group"]/span/input').click()
    except WebDriverException:
        # Switch back to the original window, clear the previous company name,
        # type the next one and click this page's search button.
        driver.switch_to.window(driver.window_handles[0])
        time.sleep(1)
        driver.find_element_by_xpath('//div[@class="input-group"]/a').click()
        driver.find_element_by_xpath(key_xpath).send_keys(name)
        time.sleep(1)
        driver.find_element_by_xpath('//div[@class="input-group"]/span/button').click()


def _scrape_risk_page(company, credit_code):
    """Scrape the opened risk page of one company and return its records."""
    tab_xpath = '//div[@class="tab pull-left"]/a[{}]'
    # Tab links: own risk, related risk, hints.
    url1 = driver.find_element_by_xpath(tab_xpath.format(1)).get_attribute('href')
    url2 = driver.find_element_by_xpath(tab_xpath.format(2)).get_attribute('href')
    url3 = driver.find_element_by_xpath(tab_xpath.format(3)).get_attribute('href')
    records = []
    if url1:
        # ------------------------------ own risk ------------------------------
        driver.find_element_by_xpath(tab_xpath.format(1)).click()
        time.sleep(1)
        print('{} 包含 自身风险 数据'.format(company))
        # Court judgements (id=judgementList).
        judgements = driver.find_elements_by_xpath(
            '//div[@class="container"]/div[@id="judgementList"]/section/div')
        if judgements:
            print('----------------------命中裁判文书----------------------')
            records.extend(_collect_modal_records(
                judgements, '裁判文书', _JUDGEMENT_FIELDS, 0, company, credit_code))
        # Hearing notices (class="panel m-b-xs").
        notices = driver.find_elements_by_xpath(
            '//div[@class="container"]/section[@class="panel m-b-xs"]/div')
        if notices:
            print('----------------------命中开庭公告----------------------')
            records.extend(_collect_modal_records(
                notices, '开庭公告', _NOTICE_FIELDS, 2, company, credit_code))
        # TODO: not scraped yet -- administrative penalties (id=apList),
        # tax administrative penalties (id=tpList).
        time.sleep(random.randint(1, 3))
    if url2:
        print('{} 包含 关联风险 数据'.format(company))
        # TODO: not scraped yet -- equity pledges (id=PledgeList), legal
        # representative changes (id=OperList), major shareholder changes
        # (id=PartnerList), serious violations (id=SeriousViolation), abnormal
        # operations (id=ExceptionList), dishonest judgment debtors
        # (id=shixinList), judgment debtors (id=zhixingList), consumption
        # restrictions (id=stList).
        time.sleep(random.randint(1, 3))
    if url3:
        print('{} 包含 提示信息 数据'.format(company))
        # TODO: not scraped yet -- legal representative changes and major
        # shareholder changes (section class="panel m-b-xs").
    return records


def _write_results(data_list):
    """Write the scraped records to ``111.json`` and ``111.csv``."""
    with open('111.json', 'w', encoding='utf-8') as f:
        json.dump(data_list, f, ensure_ascii=False, indent=2)
    # Text mode with newline='' is what the csv module requires on Python 3;
    # utf-8-sig adds a BOM so spreadsheet tools can detect the encoding.
    with open('111.csv', 'w', newline='', encoding='utf-8-sig') as f:
        csv_writer = csv.writer(f)
        if data_list:
            # Header row, then one row per record.
            csv_writer.writerow(data_list[0].keys())
            for row in data_list:
                csv_writer.writerow(row.values())


def run():
    """Search every company listed in ``data.json`` and export its legal-risk records.

    ``data.json`` is expected to hold a list of objects with the keys
    ``企业名称`` (company name) and ``统一社会信用代码`` (unified social credit
    code).  For each company the first search result is opened in a second
    window; if the page offers a risk link, court judgements and hearing
    notices are scraped.  All records are written to ``111.json`` and
    ``111.csv`` in the current working directory.
    """
    # Read the local input file.
    with open('data.json', encoding='utf-8') as f:
        datas = json.load(f)
    data_list = []
    for i in datas:
        # Kept as str: bytes would break json.dump and print as b'...'.
        data = i[u"企业名称"]
        number = i[u"统一社会信用代码"]
        _search_company(data)
        # Open the first search result and switch to the window it opened.
        driver.find_element_by_xpath(
            '//table[@class="m_srchList"]/tbody[@id="search-result"]/tr[1]/td[3]/a').click()
        driver.switch_to.window(driver.window_handles[1])
        # Decide whether the company has legal-risk data.  The element list is
        # tested because click() returns None and so can never be a condition.
        risk_links = driver.find_elements_by_xpath('//div[@class="risk-panel b-a"]/a[2]')
        if risk_links:
            # Open the risk view.
            risk_links[0].click()
            time.sleep(1)
            data_list.extend(_scrape_risk_page(data, number))
        else:
            print('{} 该公司没有风险提示'.format(data))
        # Close this company's window and return to the original one.
        driver.close()
        driver.switch_to.window(driver.window_handles[0])
    # Write the results to local files.
    _write_results(data_list)
# Script entry point: only the login step runs; the scraping step is disabled.
if __name__ == '__main__':
    login_web()
    # run()