-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathzhihuImgSelenium.py
57 lines (41 loc) · 1.49 KB
/
zhihuImgSelenium.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# -*- coding: utf-8 -*-
import os
import re
import time
from urllib import request

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Script flow:
#   1. Open the question page in Firefox and click the main action button
#      once it is present (presumably this expands the answer list --
#      TODO confirm against the live page).
#   2. Parse the rendered HTML and collect every <img> that carries both a
#      `data-rawwidth` attribute and an https `data-original` URL.
#   3. Print each URL, then download the images to img2/<n>.jpg.
url = 'https://www.zhihu.com/question/50734809'
BUTTON_XPATH = "//button[@class='Button QuestionMainAction']"
IMG_DIR = 'img2'
DOWNLOAD_TIMEOUT = 30  # seconds; keeps one stalled download from hanging the run

driver = webdriver.Firefox()
try:
    driver.get(url)
    try:
        # The wait returns the located element, so no second lookup is needed.
        button = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, BUTTON_XPATH)))
    except TimeoutException as e:
        # The button never showed up: report it and fall back to parsing
        # whatever the page has already rendered instead of crashing.
        print(e)
    else:
        print(button.text)
        button.click()
    html = driver.page_source
finally:
    # Always release the browser process, even if loading the page failed.
    driver.quit()

bsObj = BeautifulSoup(html, 'html.parser')
# NOTE: r'\d{0,4}' can match the empty string, so the first filter in effect
# only requires the `data-rawwidth` attribute to be present.
dataList = bsObj.find_all(name='img', attrs={
    'data-rawwidth': re.compile(r'\d{0,4}'),
    'data-original': re.compile(r'https://'),
})

# First pass: list every image URL followed by a numbered separator.
for page, data in enumerate(dataList, start=1):
    print(data.attrs['data-original'])
    print("\n-----" + str(page) + "-------\n")

# Second pass: download each image. File names follow the position in
# dataList, so a skipped image leaves a gap in the numbering.
os.makedirs(IMG_DIR, exist_ok=True)
for imgPage, data in enumerate(dataList, start=1):
    try:
        # Fetch the whole image before opening the target file so a failed
        # download does not leave an empty .jpg behind.
        with request.urlopen(data.attrs['data-original'],
                             timeout=DOWNLOAD_TIMEOUT) as response:
            content = response.read()
    except OSError as e:
        # URLError and socket timeouts are OSError subclasses; skip this
        # image and keep going with the rest.
        print(e)
        continue
    with open(os.path.join(IMG_DIR, str(imgPage) + '.jpg'), 'wb') as w:
        w.write(content)
    print("第 " + str(imgPage) + " 张图片")