forked from regosen/gallery_get
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathreddit_get.py
executable file
·175 lines (154 loc) · 6.16 KB
/
reddit_get.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
# REDDIT_GET is a tool for downloading all imgur albums and pictures
# that were submitted by a given reddit user.
#
# DEPENDENCIES
# This relies on gallery_get and the imgur_album plugin.
#
# See gallery_get for more info
#
# Rego Sen
# Nov 2, 2013
#
import gallery_get
import os, time, sys, traceback
import datetime, json
from gallery_utils import *
# URL template for a user's submitted posts as JSON; %s is filled with the reddit user name.
USER_QUERY = "https://www.reddit.com/user/%s/submitted/.json?limit=1000"
# Default destination root, taken from gallery_get so both tools start from the same value.
DEST_ROOT = gallery_get.DEST_ROOT
# Runs at import time. NOTE(review): presumably creates DEST_ROOT if it is missing -- confirm in gallery_utils.
safe_makedirs(DEST_ROOT)
# To speed this up, don't crawl links with pages that we know aren't galleries
NON_GALLERY_DOMAINS = [
"youtube.com",
"youtu.be",
"www.reddit.com"
]
# URL fragments that are still processed even when the link's domain matches NON_GALLERY_DOMAINS
GALLERY_PATH_EXCEPTIONS = [
"reddit.com/gallery/"
]
def is_individual_imgur(url):
    """Return True unless *url* points at an imgur album or gallery page.

    Only the substrings "/imgur.com/a/" and "/imgur.com/gallery/" are checked;
    any other URL (imgur or not) is reported as an individual image.
    """
    multi_image_markers = ("/imgur.com/a/", "/imgur.com/gallery/")
    return not any(marker in url for marker in multi_image_markers)
class RedditGet(object):
    """Download the images and galleries submitted by one reddit user.

    Attributes:
        user: reddit user name whose submissions are fetched.
        dest: destination root; each post gets its own sub-folder under
            ``<dest>/<user>/``.
        flush_jobs: when true, ``gallery_get.flush_jobs()`` is called at the
            end of :meth:`run`.
    """

    def __init__(self, user, dest, flush_jobs=True):
        self.user = user
        self.dest = dest
        self.flush_jobs = flush_jobs

    @staticmethod
    def _ext_from_content_type(content_type, default="jpg"):
        """Map a Content-Type header value to a file extension.

        "image/jpeg" becomes "jpg"; any other "image/<subtype>" yields
        ``<subtype>``.  A missing header, a non-image media type, or an empty
        subtype yields *default*.  Header parameters (``; charset=...``) are
        ignored.
        """
        if not content_type:
            return default
        media_type = content_type.split(";", 1)[0].strip().lower()
        prefix = "image/"
        if not media_type.startswith(prefix):
            return default
        subtype = media_type[len(prefix):]
        if not subtype:
            return default
        return "jpg" if subtype == "jpeg" else subtype

    def get_user_json(self):
        """Return the parsed JSON listing of the user's submissions.

        A local file named ``<user>.json`` in the current directory is used
        when it exists; otherwise reddit is queried, retrying up to five times
        with a two-second pause between attempts.  If the result has no
        "data" key, an error is printed and whatever was parsed (possibly an
        empty dict) is returned.
        """
        reddit_json = {}
        cache_path = self.user + ".json"
        query_url = USER_QUERY % self.user
        if os.path.exists(cache_path):
            print("Getting JSON data from local file (%s)" % cache_path)
            # Context manager so the handle is closed even if read() fails.
            with open(cache_path, "r") as cache_file:
                reddit_json_str = unicode_safe(cache_file.read())
            reddit_json = json.loads(reddit_json_str)
        else:
            print("Requesting JSON data from reddit...")
            for _ in range(5):
                try:
                    reddit_json_str = urlopen_text(query_url)
                    reddit_json = json.loads(reddit_json_str)
                except URLError:
                    # NOTE(review): if URLError here is urllib's, HTTPError is a
                    # subclass, so every HTTP error status stops the retries at
                    # this point -- confirm that is intended.
                    break
                except Exception as e:
                    # A 404 will not get better on retry; anything else
                    # (e.g. a non-JSON body) is retried after a pause.
                    if getattr(e, 'code', None) == 404:
                        break
                if "data" in reddit_json:
                    break
                time.sleep(2)  # workaround for server-side request frequency issues
        if "data" not in reddit_json:
            print("ERROR getting json data after several retries! Does the user exist?")
            print("If so, try saving the contents of the following to %s and try again." % cache_path)
            print(query_url)
        return reddit_json

    def folder_from_post(self, data):
        """Return the destination folder for one post.

        The folder name is the post's creation date (``data['created']``
        formatted as YYYY-MM-DD in local time) followed by " - <title>" when
        the title is non-empty.  Path separators in the title are replaced
        with underscores so the title cannot introduce sub-directories.
        """
        sdate = datetime.datetime.fromtimestamp(data['created']).strftime("%Y-%m-%d")
        title = data['title'].replace('/', '_').replace('\\', '_').strip()
        if title:
            title = " - " + title
        return os.path.join(unicode_safe(self.dest), self.user, gallery_get.safe_str(sdate + title))

    # includes special shortcuts for skipping the redirect
    def process_reddit_post(self, url, folder):
        """Download the content behind *url* into *folder*.

        Direct reddit uploads and imgur images are downloaded as single
        images; everything else is handed to ``gallery_get.run_wrapped``.
        """
        if "/i.reddituploads.com/" in url:
            gallery_get.download_image(url + ".jpg", folder)
        elif "/imgur.com/" in url and is_individual_imgur(url):
            # Create direct image URL with dummy extension (otherwise it will redirect)
            # Then get correct extension from header
            img_base = url.replace("/imgur.com/", "/i.imgur.com/")
            response = urlopen_safe("%s.%s" % (img_base, "jpg"))
            try:
                content_type = response.headers.get("content-type")
            finally:
                # Release the probe connection when the response object supports it.
                close = getattr(response, "close", None)
                if callable(close):
                    close()
            ext = self._ext_from_content_type(content_type)
            gallery_get.download_image("%s.%s" % (img_base, ext), folder)
        elif "/i.imgur.com/" in url:
            gallery_get.download_image(url, folder)
        else:
            # TODO: use Queue or eventlet for launching gallery_get.run_wrapped()
            gallery_get.run_wrapped(url, folder, titleAsFolder=True, cacheDest=False, flushJobs=False, allowGenericPlugin=False)

    def run(self):
        """Process every submitted post of the user, skipping known
        non-gallery domains and links already seen (case-insensitively)."""
        reddit_json = self.get_user_json()
        if "data" in reddit_json:
            visited_links = set()
            for post in reddit_json['data']['children']:
                data = post['data']
                url = data['url']
                domain = urlparse(url).netloc.lower()
                if any(x in domain for x in NON_GALLERY_DOMAINS) and not any(x in url for x in GALLERY_PATH_EXCEPTIONS):
                    print("Skipping non-gallery link: " + url)
                    continue
                link_key = url.lower()
                if link_key in visited_links:
                    print("Skipping already visited link: " + url)
                    continue
                visited_links.add(link_key)
                self.process_reddit_post(url, self.folder_from_post(data))
        if self.flush_jobs:
            gallery_get.flush_jobs()
def run_wrapped(user, dest="", flush_jobs=True):
    """Run a download for *user*, reporting any failure instead of raising.

    When *dest* is given it is passed to ``gallery_get.safeCacheDestination``
    and becomes the module-level DEST_ROOT.  Any exception raised during the
    run is printed with a traceback and the call parameters.

    Returns the path ``DEST_ROOT/<user>``.
    """
    global DEST_ROOT
    divider = '-' * 60
    try:
        if dest:
            gallery_get.safeCacheDestination(dest)
            DEST_ROOT = unicode_safe(dest)
        downloader = RedditGet(user, dest or DEST_ROOT, flush_jobs)
        downloader.run()
    except:  # catch-all boundary: every failure is reported below, none propagates
        print('\n' + divider)
        traceback.print_exc(file=sys.stdout)
        print("Using params: [%s, %s]" % (user, dest))
        print(divider + '\n')
        print(gallery_get.EXCEPTION_NOTICE)
    user_folder = os.path.join(DEST_ROOT, user)
    return user_folder
def run_prompted():
    """Ask for a reddit user and a destination on the console, then download.

    Exits the process when no user name is entered.  An empty destination
    falls back to DEST_ROOT.
    """
    reddit_user = str_input("Input reddit user: ").strip()
    if not reddit_user:
        print("Nothing to do!")
        sys.exit()

    dest_prompt = "Destination (%s): " % encode_safe(DEST_ROOT)
    chosen_dest = str_input(dest_prompt).strip()
    if chosen_dest:
        gallery_get.safeCacheDestination(chosen_dest)
    target_root = chosen_dest or DEST_ROOT
    RedditGet(reddit_user, target_root).run()
def run(user="", dest=""):
    """Entry point for callers: prompt interactively when *user* is empty,
    otherwise run the wrapped download for *user* into *dest*."""
    if user:
        run_wrapped(user, dest)
        return
    run_prompted()
def main():
    """Command-line entry point.

    Does nothing unless this file is the launched script (compared by base
    name without extension).  With no arguments the user is prompted; the
    first argument is the reddit user and the optional second argument
    replaces DEST_ROOT.
    """
    global DEST_ROOT

    def stem(path):
        # Base file name without directory or extension; '' for an empty path.
        if not path:
            return ''
        return os.path.splitext(os.path.basename(str(path)))[0]

    if stem(__file__) != stem(sys.argv[0]):
        return

    ### DIRECT LAUNCH (not import)
    cli_args = sys.argv[1:]
    if not cli_args:
        run_prompted()
        return
    if len(cli_args) > 1:
        DEST_ROOT = unicode_safe(cli_args[1])
    run_wrapped(cli_args[0], DEST_ROOT)


if __name__ == '__main__':
    main()