forked from ankitshekhawat/pinterest-scraper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
128 lines (109 loc) · 4.07 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/usr/bin/env python3
# coding: utf-8
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import copy
import random
import socket
import sys
import time
import unicodedata
import urllib
import urllib.parse
from subprocess import call

from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.keys import Keys

try:
    from config import PINTEREST_PASSWORD, PINTEREST_USERNAME
except Exception as e:
    print(e)
def randdelay(a, b):
    """Block the caller for a random time between ``a`` and ``b`` seconds.

    The duration is drawn with ``random.uniform(a, b)`` and handed straight
    to ``time.sleep``.
    """
    pause = random.uniform(a, b)
    time.sleep(pause)
def u_to_s(uni):
    """Reduce the text ``uni`` to its ASCII-encodable part.

    The text is NFKD-normalized first, then encoded as ASCII with the
    ``'ignore'`` error handler, so anything that cannot be encoded is
    dropped. The encoded result of ``str.encode`` is returned as-is.
    """
    decomposed = unicodedata.normalize('NFKD', uni)
    ascii_only = decomposed.encode('ascii', 'ignore')
    return ascii_only
class PinterestHelper(object):
    """Log in to Pinterest through a Selenium-driven Chrome and collect image URLs.

    Creating an instance opens a browser window and submits the login form;
    ``runme`` then scrolls a page and gathers the ``src`` of its thumbnails.

    NOTE: this class relies on the ``find_element_by_*`` helpers, which were
    removed in Selenium 4.3, so it needs an older Selenium release.
    """

    # Size markers looked for in an image URL; each one found is rewritten
    # to _FULL_SEGMENT before the URL is collected.
    _THUMB_SEGMENTS = ("/236x/", "/474x/")
    _FULL_SEGMENT = "/736x/"

    def __init__(self, login, pw, download=True):
        """Start Chrome, open pinterest.com and submit the login form.

        Args:
            login: Text typed into the form field named ``id``.
            pw: Text typed into the form field named ``password``.
            download: Stored as ``self.download``; nothing else in this
                class reads it.
        """
        self.download = download
        self.browser = webdriver.Chrome()
        self.browser.get("https://www.pinterest.com")
        email_elem = self.browser.find_element_by_name('id')
        email_elem.send_keys(login)
        password_elem = self.browser.find_element_by_name('password')
        password_elem.send_keys(pw)
        password_elem.send_keys(Keys.RETURN)
        # Short randomized pause after submitting the form.
        randdelay(2, 4)

    @classmethod
    def _full_size_src(cls, src):
        """Return ``src`` with its size marker rewritten, or None if it has none.

        A URL qualifies only when it contains one of ``_THUMB_SEGMENTS``;
        empty or missing values and every other URL yield ``None``.
        """
        if not src:
            return None
        if not any(segment in src for segment in cls._THUMB_SEGMENTS):
            return None
        for segment in cls._THUMB_SEGMENTS:
            src = src.replace(segment, cls._FULL_SEGMENT)
        return src

    def runme(self, url, threshold=None):
        """Open ``url``, page down repeatedly and collect thumbnail URLs.

        Args:
            url: Page to load in the browser.
            threshold: Maximum number of scroll passes. When ``None`` (the
                default) the value is taken from ``sys.argv[2]`` if present,
                otherwise a single pass is made.

        Returns:
            A de-duplicated list (in no particular order) of the rewritten
            image URLs, each passed through ``u_to_s``.
        """
        if threshold is None:
            threshold = int(sys.argv[2]) if len(sys.argv) > 2 else 1
        final_results = []
        previmages = []
        tries = 0
        try:
            self.browser.get(url)
            while threshold > 0:
                try:
                    results = []
                    images = self.browser.find_elements_by_tag_name("img")
                    # Count consecutive passes that found the very same
                    # elements; give up once scrolling stops changing them.
                    if images == previmages:
                        tries += 1
                    else:
                        tries = 0
                    if tries > 20:
                        return final_results
                    for image in images:
                        src = image.get_attribute("src")
                        full_src = self._full_size_src(src)
                        if full_src is None:
                            continue
                        print("src")
                        print(src)
                        results.append(u_to_s(full_src))
                    previmages = copy.copy(images)
                    final_results = list(set(final_results + results))
                    # Send PAGE_DOWN to the first anchor on the page to scroll.
                    dummy = self.browser.find_element_by_tag_name('a')
                    dummy.send_keys(Keys.PAGE_DOWN)
                    randdelay(0, 1)
                    threshold -= 1
                except StaleElementReferenceException:
                    # An element went stale mid-pass: drop this pass's
                    # results but still count it against the budget.
                    threshold -= 1
        except (socket.error, socket.timeout):
            # Best effort: a socket failure ends the scrape and whatever
            # was collected so far is returned.
            pass
        return final_results

    def close(self):
        """ Closes the browser """
        self.browser.close()
def _pins_slug(term):
    """Return the base name used for the pins file and default download dir.

    A plain search term keeps the historical naming (spaces removed). A URL
    would otherwise put ``://`` and ``/`` into the file name, so its host and
    path are flattened to a name made of alphanumerics, ``.``, ``_`` and ``-``.
    """
    parsed = urllib.parse.urlparse(term)
    if parsed.scheme and parsed.netloc:
        raw = parsed.netloc + parsed.path
        flattened = "".join(
            ch if ch.isalnum() or ch in "._-" else "_" for ch in raw
        )
        return flattened.strip("_") or "pins"
    return term.replace(" ", "")


def main():
    """Scrape image URLs for a search term or URL and download them with aria2c.

    Command line: ``scraper.py <search term or URL> [scroll count] [dest dir]``.
    The collected URLs are written to ``<slug>_pins.txt`` in the current
    directory and that file is then handed to ``aria2c``.

    Raises:
        SystemExit: with status 1 when no search term is given.
    """
    if len(sys.argv) > 1:
        term = sys.argv[1]
    else:
        print("\n\n[Error] Need arguments in this format:")
        print("python scraper.py <search term> <how many scroll down> <destination dir[optional]>\n\n")
        # A usage error is reported with a non-zero status.
        sys.exit(1)

    is_url = urllib.parse.urlparse(term)
    if is_url.scheme and is_url.netloc:
        url = term
    else:
        url = 'http://pinterest.com/search/pins/?q=' + urllib.parse.quote(term)

    ph = PinterestHelper(PINTEREST_USERNAME, PINTEREST_PASSWORD)
    try:
        images = ph.runme(url)
    finally:
        # Close the browser even when scraping raises.
        ph.close()
    print("images")
    print(images)

    slug = _pins_slug(term)
    pins_file = slug + "_pins.txt"
    with open(pins_file, "w") as pins:
        pins.write('\n'.join([i.decode('UTF-8') for i in images]))

    if len(sys.argv) > 3:
        destination = sys.argv[3]
    else:
        destination = "./" + slug
    print("term / destination")
    print(term, destination)

    # download images
    # An argument list (no shell) keeps spaces or shell metacharacters in the
    # term or destination from being interpreted by a shell.
    # NOTE(review): the aria2c manual describes optional-argument options in
    # the form --option=value; confirm "--auto-file-renaming false" is parsed
    # as intended. The arguments are kept exactly as they were.
    call(['aria2c', '-i', './' + pins_file, '-d', destination,
          '--continue', '--auto-file-renaming', 'false'])


if __name__ == '__main__':
    main()