grab_photos.py
#! /usr/bin/python2
# vim: set fileencoding=utf-8
"""Try its best to retrieve a list of all photos taken in a given city and
insert them with additional information in a mongo database."""
import datetime
import calendar
import CommonMongo as cm
import requests
import flickr_api
from api_keys import FLICKR_KEY as API_KEY
import re
from time import sleep, time
from timeit import default_timer as clock
from httplib import BadStatusLine
import cities
import logging
import os
import arguments
import sys
requests.packages.urllib3.disable_warnings()
now = datetime.datetime.now().strftime('%Y%m%d_%H%M')
HINT = now if len(sys.argv) < 2 else sys.argv[1].strip()
LOG_FILE = 'photos_{}.log'.format(HINT)
TMPDIR = os.environ.get('TMPDIR', '/tmp')
logging.basicConfig(filename=os.path.join(TMPDIR, LOG_FILE),
level=logging.INFO,
format='%(asctime)s [%(levelname)s]: %(message)s')
TITLE_AND_TAGS = re.compile(r'^(?P<title>[^#]*)\s*(?P<tags>(?:#\w+\s*)*)$')
BASE_URL = "https://api.flickr.com/services/rest/"
PER_PAGE = 225
# According to https://secure.flickr.com/services/developer/api/, one API key
# can only make 3600 requests per hour, so we need to keep track of our usage
# to stay under the limit.
# TODO: move this logic to a RequestSupervisor class.
# NOTE: Actually, it's probably unnecessary since, on average, requests take
# more than one second to complete.
CURRENT_REQ = 0
TOTAL_REQ = 0
START_OF_REQUESTS = 0
REQUEST_INTERVAL = 3600  # in seconds
MAX_REQUEST = 3600
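# Bounding boxes of a few regions of interest, given as (latitude, longitude)
# pairs for the bottom-left (BL) and top-right (TR/UR) corners.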
SF_BL = (37.7123, -122.531)
SF_TR = (37.84, -122.35)
NY_BL = (40.583, -74.040)
NY_TR = (40.883, -73.767)
LD_BL = (51.475, -0.245)
LD_TR = (51.597, 0.034)
VG_BL = (36.80, -78.52)
VG_TR = (38.62, -76.27)
CA_BL = (37.05, -122.21)
CA_TR = (39.59, -119.72)
US_BL = (26, -124.1)
US_TR = (48.6, -66.6)
NANTES_BL = (47.195, -1.61)
NANTES_UR = (47.27, -1.5)


def send_request(**args):
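    """Perform one 'flickr.photos.search' call with the given keyword
    arguments, sleeping first if the hourly request budget is exhausted.
    Return the list of photos and the total number of matches reported by
    the API."""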
global CURRENT_REQ, START_OF_REQUESTS, TOTAL_REQ
if CURRENT_REQ > MAX_REQUEST:
now = time()
next_time = START_OF_REQUESTS + REQUEST_INTERVAL
if now < next_time:
pause = next_time - now + 2
logging.info("made {} request in {}s: sleeping for {}s{}".format(
CURRENT_REQ, now - START_OF_REQUESTS, pause,
" (but then I come back well rested, raring to go!)"))
sleep(pause)
START_OF_REQUESTS = now
TOTAL_REQ += CURRENT_REQ
CURRENT_REQ = 0
        else:
            # the hour-long window has already elapsed: start a new one
            # without sleeping
            START_OF_REQUESTS = now
            TOTAL_REQ += CURRENT_REQ
            CURRENT_REQ = 0
args['method'] = 'flickr.photos.search'
args['format'] = 'json'
args['api_key'] = API_KEY
args['nojsoncallback'] = 1
req = requests.get(BASE_URL, params=args)
try:
r = req.json()
CURRENT_REQ += 1
return r['photos']['photo'], r['photos']['total']
    except BadStatusLine:
        raise flickr_api.FlickrError('BadStatusLine')


def parse_title(t):
""" Separate title from terminal hashtags
>>> parse_title('Carnitas Crispy Taco with guac #foodporn #tacosrule')
('Carnitas Crispy Taco with guac', ['foodporn', 'tacosrule'])
>>> parse_title('#foodporn #tacosrule')
('', ['foodporn', 'tacosrule'])
>>> parse_title('Carnitas Crispy Taco with guac')
('Carnitas Crispy Taco with guac', [])
"""
    if '#' not in t:
return t, []
m = TITLE_AND_TAGS.match(t)
if m is not None:
title = m.group('title').strip()
tags = m.group('tags').replace('#', '').split()
return title, tags
return t, []


def get_human_tags(s):
"""
>>> get_human_tags(u'iphoneography instagramapp uploaded:by=instagram')
([u'iphoneography', u'instagramapp'], None)
>>> get_human_tags(u'square {foursquare}:{venue}=4bd1db7f9854d13a8260fa4d')
([u'square'], u'4bd1db7f9854d13a8260fa4d')
>>> get_human_tags(u'square foursquare:venue=4bd1db7f9854d13a8260fa4d')
([u'square'], u'4bd1db7f9854d13a8260fa4d')
"""
if not isinstance(s, unicode) or len(s) == 0:
return [], None
tags = []
venue = None
for t in s.split():
        if ':' not in t:
tags.append(t)
else:
if venue is None and 'foursquare' in t and 'venue' in t:
venue = t.split('=')[-1]
return tags, venue


def photo_to_dict(p):
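    """Convert a raw photo record returned by Flickr into the document stored
    in MongoDB, or return None when a mandatory field is missing, cannot be
    parsed, or when the photo carries no tag at all."""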
global HINT
start = clock()
s = {}
if not ('id' in p and
'owner' in p and
'datetaken' in p and
'dateupload' in p and
'tags' in p and
'title' in p and
'farm' in p and
'secret' in p and
'server' in p and
'longitude' in p and
'latitude' in p):
took = 1000*(clock() - start)
        logging.debug('map {} in {:.3f}ms (missing)'.format(p.get('id'), took))
return None
try:
s['_id'] = int(p['id'])
except ValueError:
        logging.info(str(p['id']) + ' is not a valid id')
return None
logging.debug(p['id'])
s['uid'] = p['owner']
try:
s['taken'] = datetime.datetime.strptime(p['datetaken'],
'%Y-%m-%d %H:%M:%S')
except ValueError:
return None
# The 'posted' date represents the time at which the photo was uploaded to
# Flickr. It's always passed around as a unix timestamp (seconds since Jan
# 1st 1970 GMT). It's up to the application provider to format them using
# the relevant viewer's timezone.
try:
s['upload'] = datetime.datetime.fromtimestamp(float(p['dateupload']))
except ValueError:
return None
title, tags = parse_title(p['title'])
s['title'] = title
explicit_tag, venue = get_human_tags(p['tags'])
s['tags'] = explicit_tag + tags
s['venue'] = venue
if len(s['tags']) < 1:
took = 1000*(clock() - start)
logging.debug('map {} in {:.3f}ms (no tag)'.format(s['_id'], took))
return None
# pymongo.errors.OperationFailure: Can't extract geo keys from object,
# malformed geometry?:
# {type: "Point", coordinates: [ "0.000000", 51.486554 ] }
try:
lng, lat = float(p['longitude']), float(p['latitude'])
except ValueError:
return None
s['loc'] = {"type": "Point", "coordinates": [lng, lat]}
s['farm'] = p['farm']
s['server'] = p['server']
s['secret'] = p['secret']
s['hint'] = HINT
took = 1000*(clock() - start)
logging.debug('map {} in {:.3f}ms'.format(s['_id'], took))
return s


def higher_request(start_time, bbox, db, level=0):
""" Try to insert all photos in this region into db by potentially making
recursing call, eventually to lower_request when the region accounts for
less than 4000 photos. """
if level > 20:
logging.warn("Going too deep with {}.".format(bbox))
return 0
_, total = make_request(start_time, bbox, 1, need_answer=True,
max_tries=10)
if level == 0:
logging.info('Aiming for {} photos'.format(total))
if total > 4000:
photos = 0
start = clock()
quads = split_bbox(bbox)
for q in quads:
photos += higher_request(start_time, q, db, level+1)
logging.info('Finish {}: {} photos in {}s'.format(bbox, photos,
clock()-start))
return photos
if total > 5:
return lower_request(start_time, bbox, db, total/PER_PAGE + 1)
    logging.warn('Not enough photos in {} (only {}).'.format(bbox, total))
return 0


def lower_request(start_time, bbox, db, num_pages):
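    """Fetch pages 1 to 'num_pages' of the photos lying in 'bbox', save them
    into 'db' and retry once the pages that initially failed. Return the
    number of photos saved."""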
failed_page = []
total = 0
hstart = clock()
for page in range(1, num_pages+1):
start = clock()
res, _ = make_request(start_time, bbox, page)
if res is None:
failed_page.append(page)
else:
took = ' ({:.4f}s)'.format(clock() - start)
            logging.info('Got result for page {}{}'.format(page, took))
saved = save_to_mongo(res, db)
took = ' ({:.4f}s)'.format(clock() - start)
page_desc = 'page {}, {} photos {}'.format(page, saved, took)
            logging.info('successfully inserted ' + page_desc)
total += saved
sleep(1)
for page in failed_page:
start = clock()
res, _ = make_request(start_time, bbox, page, need_answer=True)
if res is None:
took = ' ({:.4f}s)'.format(clock() - start)
logging.warn('Failed to get page {}{}'.format(page, took))
else:
            saved = save_to_mongo(res, db)
took = ' ({:.4f}s)'.format(clock() - start)
page_desc = 'page {}, {} photos {}'.format(page, saved, took)
            logging.info('Finally got ' + page_desc)
total += saved
sleep(1)
logging.info('Finish {}: {} photos in {}s'.format(bbox, total,
clock()-hstart))
return total


def save_to_mongo(photos, collection):
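    """Insert into 'collection' the photos that could be converted into
    documents and return how many were considered."""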
converted = [photo_to_dict(p) for p in photos]
tagged = [p for p in converted if p is not None]
total = len(tagged)
if total > 0:
try:
collection.insert(tagged, continue_on_error=True)
except cm.pymongo.errors.DuplicateKeyError:
# we don't really care, it means that we already have these ones
            logging.info('duplicate')
return total


def split_bbox(bbox):
"""
>>> split_bbox(((0, 0), (20, 22)))
[((0, 0), (10, 11)), ((0, 11), (10, 22)), ((10, 0), (20, 11)), ((10, 11), (20, 22))]
"""
bottom_left = bbox[0]
upper_right = bbox[1]
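    # ur_increment is half the latitude span and bl_increment half the
    # longitude span; the points below are the corners, edge midpoints and
    # centre of the box, from which the four quadrants are assembled.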
bl_increment = (upper_right[1] - bottom_left[1])/2
ur_increment = (upper_right[0] - bottom_left[0])/2
p1 = bottom_left
p2 = (bottom_left[0], bottom_left[1]+bl_increment)
p3 = (bottom_left[0]+1*ur_increment, bottom_left[1]+0*bl_increment)
p4 = (bottom_left[0]+1*ur_increment, bottom_left[1]+1*bl_increment)
p5 = (bottom_left[0]+1*ur_increment, bottom_left[1]+2*bl_increment)
p6 = (bottom_left[0]+2*ur_increment, bottom_left[1]+1*bl_increment)
p7 = (bottom_left[0]+2*ur_increment, bottom_left[1]+2*bl_increment)
return [(p1, p4), (p2, p5), (p3, p6), (p4, p7)]


def make_request(start_time, bbox, page, need_answer=False, max_tries=3):
""" Queries photos uploaded after 'start_time' in the region defined by
'bbox'. If successful, return all of them in page 'page' along with some
info. Otherwise, return None by default. If 'need_answer' is true, try
again at most 'max_tries' times. """
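    # Flickr expects the bbox as 'min_lon,min_lat,max_lon,max_lat' whereas we
    # store corners as (latitude, longitude) pairs, hence the reordering.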
bbox = '{:.9f},{:.9f},{:.9f},{:.9f}'.format(bbox[0][1], bbox[0][0],
bbox[1][1], bbox[1][0])
min_upload = calendar.timegm(start_time.utctimetuple())
max_upload = calendar.timegm(datetime.datetime.now().utctimetuple())
while max_tries > 0:
error = False
try:
res, t = send_request(min_upload_date=min_upload,
max_upload_date=max_upload,
min_taken_date='1990-07-18 17:00:00',
bbox=bbox, accuracy='16',
content_type=1, # photos only
media="photos", # not video
per_page=PER_PAGE, page=page,
extras='date_upload,date_taken,geo,tags')
except flickr_api.FlickrError as e:
logging.warn('Error getting page {}: {}'.format(page, e))
error = True
except (KeyboardInterrupt, SystemExit):
raise
except:
error = True
if not error and len(res) > 0:
return res, int(t)
if need_answer:
max_tries -= 1
logging.info('insisting on page {}'.format(page))
sleep(5)
else:
return None, 0
    logging.warn('Error getting page {}: too many tries'.format(page))
return None, 0
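

# Entry point: read the target city from the command line, connect to the
# 'world' database, make sure the photo collection is indexed and recursively
# harvest every photo uploaded since July 19th, 2014 inside the city's
# bounding box.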
if __name__ == '__main__':
START_OF_REQUESTS = time()
logging.info('initial request')
args = arguments.city_parser().parse_args()
photos = cm.connect_to_db('world', args.host, args.port)[0]['photos']
photos.ensure_index([('loc', cm.pymongo.GEOSPHERE),
('tags', cm.pymongo.ASCENDING),
('uid', cm.pymongo.ASCENDING)])
city = args.city
CITY = (cities.US + cities.EU)[cities.INDEX[city]]
HINT = city
bbox = (CITY[:2], CITY[2:])
start_time = datetime.datetime(2014, 7, 19)
total = higher_request(start_time, bbox, photos)
logging.info('Saved a total of {} photos.'.format(total))
logging.info('made {} requests.'.format(TOTAL_REQ))
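
# Typical invocation (a sketch: the exact arguments accepted by
# arguments.city_parser are defined elsewhere and may differ):
#   python2 grab_photos.py <city> [--host HOST] [--port PORT]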