-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathscraper.py
55 lines (47 loc) · 2.1 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
from selenium import webdriver
from selenium.webdriver.common.by import By
import urllib.request
import time
import os
def download_image(person_name, src, seq, dir):
    """Download one image and save it as ``<person_name><seq>.png``.

    The destination is ``dir`` resolved against the current working
    directory (an absolute ``dir`` is used as-is). The directory is created
    if it does not exist yet. The ``.png`` extension is always used,
    whatever the real format of the downloaded data is.

    This is a best-effort helper: a failed download is reported on stdout
    and signalled through the return value, it never raises, so one broken
    image cannot abort a whole scraping run.

    Args:
        person_name: File-name prefix, e.g. ``"JohnTravolta"``.
        src: URL of the image; anything ``urllib.request`` can open.
        seq: Sequence number appended to the prefix.
        dir: Directory the file is written to.

    Returns:
        ``True`` if the file was written, ``False`` otherwise.
    """
    if not src:
        # Callers may hand over None/"" when an element has no usable src.
        print("Skipping image", seq, "- no source URL")
        return False

    try:
        filename = f"{person_name}{seq}.png"  # e.g. "JohnTravolta0.png"
        image_path = os.path.abspath(os.path.join(os.getcwd(), dir, filename))
        os.makedirs(os.path.dirname(image_path), exist_ok=True)
        urllib.request.urlretrieve(src, image_path)
    except Exception as exc:
        # Deliberately broad: network, HTTP, URL-format and filesystem errors
        # are all non-fatal here, but they must be visible to the user.
        print(f"Failed to download image {seq} from {src!r}: {exc}")
        return False
    return True
def browse_page(person_name, pages, dir):
    """Walk through result pages and download every image found on them.

    Relies on the module-level ``driver`` (a Selenium WebDriver) having
    already loaded the first page. For each of the ``pages`` iterations the
    function scrolls to the bottom of the page, waits two seconds, collects
    all ``picture > img`` elements, passes each ``src`` to
    ``download_image`` and finally clicks the "next" pagination button.

    Browser errors are reported on stdout and do not stop the run. If the
    "next" button cannot be clicked the loop still continues, so the same
    page may be processed again.

    Args:
        person_name: File-name prefix forwarded to ``download_image``.
        pages: Number of page iterations to perform.
        dir: Target directory forwarded to ``download_image``.
    """
    # Imported locally so this function does not depend on extra file-level
    # imports; selenium is already a dependency of this module.
    from selenium.common.exceptions import WebDriverException

    seq = 0  # Running file number, shared across all pages.
    for _ in range(pages):
        try:
            # Scroll to the end of the page and give the images time to load.
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
            time.sleep(2)
            images = driver.find_elements(By.CSS_SELECTOR, 'picture > img')
            print("Found", len(images), "images")
        except WebDriverException as exc:
            print("Could not read the current page:", exc)
            continue

        for image in images:
            try:
                src = image.get_attribute('src')
            except WebDriverException as exc:
                # E.g. the element went stale while the page was updating.
                print("Skipping image", seq, "-", exc)
            else:
                download_image(person_name, src, seq, dir)
            # Advance even on failure so numbering matches on-page position.
            seq += 1

        try:
            # find_element(By...) replaces find_element_by_css_selector,
            # which no longer exists in Selenium 4.3+.
            driver.find_element(
                By.CSS_SELECTOR, '.search-pagination__button-icon--next'
            ).click()
        except WebDriverException as exc:
            print("Could not move to the next page:", exc)
        time.sleep(2)
if __name__ == '__main__':
    # Collect all settings first so invalid input fails before a browser
    # window is opened.
    person_name = input("Please Provide The Person's Name: \n")
    url = input('Please Provide The Page URL: \n')
    save_dir = input('Please Provide The Directory Where The Data Will be Saved: \n')
    pages = int(input('Please Provide How Many Pages You Want To Be Scrapped: \n'))

    # Create the output folder (relative paths resolve against the working
    # directory); an already existing folder is fine.
    os.makedirs(save_dir, exist_ok=True)

    # ``driver`` must stay a module-level name: browse_page() reads it.
    driver = webdriver.Firefox()
    # driver = webdriver.Chrome() # IF YOU ARE USING CHROME.
    try:
        driver.maximize_window()
        driver.get(url)
        browse_page(person_name, pages, save_dir)
    finally:
        # Always shut the browser down, even if scraping fails or is
        # interrupted, so no browser/driver process is left behind.
        driver.quit()