forked from epic-kitchens/epic-kitchens-download-scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathepic_downloader.py
executable file
·444 lines (354 loc) · 21.3 KB
/
epic_downloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
import argparse
import hashlib
import os
import csv
import shutil
import sys
import warnings
# These modules only exist on Python 3; on failure, report the problem and stop.
try:
    import urllib.request
    import urllib.error
    from pathlib import Path
except ImportError as import_error:
    failure_messages = (
        'Error: {}'.format(import_error),
        'This script works with Python 3.5+. Please use a more recent version of Python',
    )
    for failure_message in failure_messages:
        print(failure_message)
    exit(-1)
def print_header(header, char='*'):
    """Print ``header`` framed by rules of ``char``, with a blank line before and after.

    Each rule repeats ``char`` once per character of ``header``.
    """
    rule = char * len(header)
    for line in ('', rule, header, rule, ''):
        print(line)
class EpicDownloader:
    """Download EPIC-KITCHENS files (videos, frames, masks, metadata, ...) to a local folder.

    On construction the split, MD5 and errata CSV files are loaded. ``download`` then
    selects the videos to fetch and dispatches to the ``download_<what>`` methods.

    Attributes:
        base_url_55 / base_url_100 / base_url_masks: remote base URLs (trailing '/' removed).
        base_output: local root folder, ``<base_output>/EPIC-KITCHENS``.
        videos_per_split: maps each split column name to the list of video dicts in it.
        challenges_splits: split column names read from the EPIC 100 splits CSV.
        md5: expected checksums, keyed by version ('55', '100', 'errata') then remote path.
        errata: maps a remote path to the URL of its corrected replacement.
        errata_only: when True, only files listed in ``errata`` are downloaded.
    """

    def __init__(self,
                 epic_55_base_url='https://data.bris.ac.uk/datasets/3h91syskeag572hl6tvuovwv4d',
                 epic_100_base_url='https://data.bris.ac.uk/datasets/2g1n6qdydwa9u22shpxqzp0t8m',
                 masks_base_url='https://data.bris.ac.uk/datasets/3l8eci2oqgst92n14w2yqi5ytu',
                 base_output=str(Path.home()),
                 splits_path_epic_55='data/epic_55_splits.csv',
                 splits_path_epic_100='data/epic_100_splits.csv',
                 md5_path='data/md5.csv',
                 errata_path='data/errata.csv',
                 errata_only=False):
        self.base_url_55 = epic_55_base_url.rstrip('/')
        self.base_url_100 = epic_100_base_url.rstrip('/')
        self.base_url_masks = masks_base_url.rstrip('/')
        self.base_output = os.path.join(base_output, 'EPIC-KITCHENS')
        self.videos_per_split = {}
        self.challenges_splits = []
        self.md5 = {'55': {}, '100': {}, 'errata': {}}
        self.errata = {}
        self.parse_splits(splits_path_epic_55, splits_path_epic_100)
        self.load_md5(md5_path)
        self.load_errata(errata_path)
        self.errata_only = errata_only

    def load_errata(self, path):
        """Fill ``self.errata`` from a CSV with ``rdsf_path`` and ``dropbox_path`` columns."""
        # newline='' is what the csv module expects for files it parses itself
        with open(path, newline='') as csvfile:
            for row in csv.DictReader(csvfile, delimiter=','):
                self.errata[row['rdsf_path']] = row['dropbox_path']

    def load_md5(self, path):
        """Fill ``self.md5`` from a CSV with ``version``, ``file_remote_path`` and ``md5`` columns.

        Raises:
            KeyError: if a row's ``version`` is not one of the keys of ``self.md5``.
        """
        with open(path, newline='') as csvfile:
            for row in csv.DictReader(csvfile, delimiter=','):
                self.md5[row['version']][row['file_remote_path']] = row['md5']

    @staticmethod
    def download_file(url, output_path):
        """Download ``url`` to ``output_path``, creating parent folders as needed.

        Failures are reported on stdout and not raised, so one bad file does not stop
        a batch. Data is written to ``<output_path>.part`` and moved into place only
        after the transfer completes, so a failed or interrupted download neither
        leaves a truncated file at ``output_path`` nor damages a file already there.
        """
        Path(os.path.dirname(output_path)).mkdir(parents=True, exist_ok=True)
        partial_path = '{}.part'.format(output_path)

        try:
            with urllib.request.urlopen(url) as response, open(partial_path, 'wb') as output_file:
                print('Downloading\nfrom {}\nto {}'.format(url, output_path))
                shutil.copyfileobj(response, output_file)
            os.replace(partial_path, output_path)
        except Exception as e:
            print('Could not download file from {}\nError: {}'.format(url, str(e)))
        finally:
            # Only present if the transfer did not complete (it is renamed away on success).
            if os.path.exists(partial_path):
                try:
                    os.remove(partial_path)
                except OSError:
                    pass  # best-effort cleanup; the download outcome was already reported

    @staticmethod
    def parse_bool(b):
        """Return True when ``b`` is 'true', 'yes' or 'y' (case and outer whitespace ignored)."""
        return b.lower().strip() in ['true', 'yes', 'y']

    @staticmethod
    def md5_checksum(path):
        """Return the hexadecimal MD5 digest of the file at ``path``."""
        hash_md5 = hashlib.md5()

        with open(path, 'rb') as f:
            # Read in 1 MiB chunks: same digest as smaller reads, far fewer calls on large files.
            for chunk in iter(lambda: f.read(1024 * 1024), b''):
                hash_md5.update(chunk)

        return hash_md5.hexdigest()

    def parse_splits(self, epic_55_splits_path, epic_100_splits_path):
        """Load both split CSVs into ``self.challenges_splits`` and ``self.videos_per_split``.

        The EPIC 55 CSV maps ``video_id`` to ``split``. In the EPIC 100 CSV every column
        other than ``video_id`` is a split, holding a boolean-like value per video.

        Raises:
            KeyError: if a non-extension video is missing from the EPIC 55 CSV.
        """
        epic_55_videos = {}

        with open(epic_55_splits_path, newline='') as csvfile:
            for row in csv.DictReader(csvfile, delimiter=','):
                epic_55_videos[row['video_id']] = row['split']

        with open(epic_100_splits_path, newline='') as csvfile:
            reader = csv.DictReader(csvfile, delimiter=',')
            self.challenges_splits = [f for f in reader.fieldnames if f != 'video_id']

            for f in self.challenges_splits:
                self.videos_per_split[f] = []

            for row in reader:
                video_id = row['video_id']
                parts = video_id.split('_')
                participant = int(parts[0].split('P')[1])
                # a three-character video number marks the video as part of the extension
                extension = len(parts[1]) == 3
                epic_55_split = None if extension else epic_55_videos[video_id]
                v = {'video_id': video_id, 'participant': participant, 'participant_str': parts[0],
                     'extension': extension, 'epic_55_split': epic_55_split}

                for split in self.challenges_splits:
                    if self.parse_bool(row[split]):
                        self.videos_per_split[split].append(v)

    def download_consent_forms(self, video_dicts):
        """Download the EPIC 55 and EPIC 100 consent documents.

        ``video_dicts`` is unused; it is accepted because ``download`` calls every
        ``download_<what>`` method with the selected videos.
        """
        files_55 = ['ConsentForm.pdf', 'ParticipantsInformationSheet.pdf']

        for f in files_55:
            output_path = os.path.join(self.base_output, 'ConsentForms', 'EPIC-55-{}'.format(f))
            url = '/'.join([self.base_url_55, 'ConsentForms', f])
            self.download_file(url, output_path)

        output_path = os.path.join(self.base_output, 'ConsentForms', 'EPIC-100-ConsentForm.pdf')
        url = '/'.join([self.base_url_100, 'ConsentForms', 'consent-form.pdf'])
        self.download_file(url, output_path)

    def download_videos(self, video_dicts, file_ext='MP4'):
        """Download the video file of every entry in ``video_dicts``."""
        def epic_55_parts(d):
            return ['videos', d['epic_55_split'], d['participant_str'], '{}.{}'.format(d['video_id'], file_ext)]

        def epic_100_parts(d):
            return [d['participant_str'], 'videos', '{}.{}'.format(d['video_id'], file_ext)]

        self.download_items(video_dicts, epic_55_parts, epic_100_parts)

    def download_rgb_frames(self, video_dicts, file_ext='tar'):
        """Download the RGB frames archive of every entry in ``video_dicts``."""
        def epic_55_parts(d):
            return ['frames_rgb_flow', 'rgb', d['epic_55_split'], d['participant_str'],
                    '{}.{}'.format(d['video_id'], file_ext)]

        def epic_100_parts(d):
            return [d['participant_str'], 'rgb_frames', '{}.{}'.format(d['video_id'], file_ext)]

        self.download_items(video_dicts, epic_55_parts, epic_100_parts)

    def download_flow_frames(self, video_dicts, file_ext='tar'):
        """Download the optical flow frames archive of every entry in ``video_dicts``."""
        def epic_55_parts(d):
            return ['frames_rgb_flow', 'flow', d['epic_55_split'], d['participant_str'],
                    '{}.{}'.format(d['video_id'], file_ext)]

        def epic_100_parts(d):
            return [d['participant_str'], 'flow_frames', '{}.{}'.format(d['video_id'], file_ext)]

        self.download_items(video_dicts, epic_55_parts, epic_100_parts)

    def download_object_detection_images(self, video_dicts, file_ext='tar'):
        """Download object detection images for the non-extension videos in ``video_dicts``."""
        # these are available for epic 55 only, but we will use the epic_100_parts func to create a consistent output
        # path
        epic_55_dicts = {k: v for k, v in video_dicts.items() if not v['extension']}

        def epic_55_parts(d):
            return ['object_detection_images', d['epic_55_split'], d['participant_str'],
                    '{}.{}'.format(d['video_id'], file_ext)]

        def epic_100_parts(d):
            return [d['participant_str'], 'object_detection_images', '{}.{}'.format(d['video_id'], file_ext)]

        self.download_items(epic_55_dicts, epic_55_parts, epic_100_parts)

    def download_metadata(self, video_dicts, file_ext='csv'):
        """Download the ``-accl`` and ``-gyro`` metadata files for the extension videos."""
        epic_100_dicts = {k: v for k, v in video_dicts.items() if v['extension']}

        def epic_100_accl_parts(d):
            return [d['participant_str'], 'meta_data', '{}-accl.{}'.format(d['video_id'], file_ext)]

        def epic_100_gyro_parts(d):
            return [d['participant_str'], 'meta_data', '{}-gyro.{}'.format(d['video_id'], file_ext)]

        # Only extension videos are passed, so the EPIC 55 path builder is never called.
        self.download_items(epic_100_dicts, None, epic_100_accl_parts)
        self.download_items(epic_100_dicts, None, epic_100_gyro_parts)

    def download_masks(self, video_dicts, file_ext='pkl'):
        """Download the hand-objects and masks files of every entry in ``video_dicts``."""
        def remote_object_hands_parts(d):
            return ['hand-objects', d['participant_str'], '{}.{}'.format(d['video_id'], file_ext)]

        def remote_masks_parts(d):
            return ['masks', d['participant_str'], '{}.{}'.format(d['video_id'], file_ext)]

        def output_object_hands_parts(d):
            return [d['participant_str'], 'hand-objects', '{}.{}'.format(d['video_id'], file_ext)]

        def output_masks_parts(d):
            return [d['participant_str'], 'masks', '{}.{}'.format(d['video_id'], file_ext)]

        # data is organised in the same way for both epic-55 and the extension so we pass the same functions
        self.download_items(video_dicts, remote_object_hands_parts, remote_object_hands_parts,
                            from_url=self.base_url_masks, output_parts=output_object_hands_parts)
        self.download_items(video_dicts, remote_masks_parts, remote_masks_parts,
                            from_url=self.base_url_masks, output_parts=output_masks_parts)

    def download_items(self, video_dicts, epic_55_parts_func, epic_100_parts_func, from_url=None, output_parts=None):
        """Download one file per video, skipping files already present with a matching MD5.

        Args:
            video_dicts: mapping of video id to video dict (as built by ``parse_splits``).
            epic_55_parts_func: builds the remote path parts for a non-extension video.
            epic_100_parts_func: builds the remote path parts for an extension video.
            from_url: base URL overriding ``base_url_55`` / ``base_url_100`` when given.
            output_parts: builds the local path parts; defaults to ``epic_100_parts_func``.

        A file whose remote path appears in ``self.errata`` is fetched from the erratum
        URL instead. With ``self.errata_only`` set, files without an erratum are skipped.
        """
        # The local layout does not depend on the video, so pick the path builder once.
        local_parts_func = epic_100_parts_func if output_parts is None else output_parts

        for d in video_dicts.values():
            extension = d['extension']
            remote_parts = epic_100_parts_func(d) if extension else epic_55_parts_func(d)
            erratum_url = self.errata.get('/'.join(remote_parts), None)

            if erratum_url is None:
                if self.errata_only:
                    continue

                if from_url is None:
                    base_url = self.base_url_100 if extension else self.base_url_55
                else:
                    base_url = from_url

                url = '/'.join([base_url] + remote_parts)
                version = '100' if extension else '55'
            else:
                print_header('~ Going to download an erratum now! ~', char='~')
                url = erratum_url
                version = 'errata'

            output_path = os.path.join(self.base_output, *local_parts_func(d))

            if self.file_already_downloaded(output_path, remote_parts, version):
                print('This file was already downloaded, skipping it: {}'.format(output_path))
            else:
                self.download_file(url, output_path)

    def file_already_downloaded(self, output_path, parts, version):
        """Return True only if ``output_path`` exists and its MD5 matches the expected one.

        The expected checksum is looked up in ``self.md5[version]`` under
        ``'/'.join(parts)``. A file with no known checksum counts as not downloaded.
        """
        if not os.path.exists(output_path):
            return False

        key = '/'.join(parts)
        remote_md5 = self.md5[version].get(key, None)

        if remote_md5 is None:
            return False

        local_md5 = self.md5_checksum(output_path)  # we already checked file exists so we are safe here
        return local_md5 == remote_md5

    def download(self, what=('videos', 'rgb_frames', 'flow_frames'), participants='all', specific_videos='all',
                 splits='all', challenges='all', extension_only=False, epic55_only=False):
        """Select videos and download the requested kinds of data for them.

        Args:
            what: names of the data to fetch; each ``w`` is dispatched to ``download_<w>``.
            participants: 'all', or a list of participant numbers (int) or ids (e.g. 'P01').
            specific_videos: 'all', or a list of video ids.
            splits: 'all', or a list of split names; a split column is selected when the
                name occurs in the part of the column name after its first '_'.
            challenges: 'all', or a list of challenge prefixes, compared with the part of
                the split column name before its first '_'.
            extension_only: keep only extension videos.
            epic55_only: keep only non-extension videos.

        When both ``participants`` and ``specific_videos`` are given, the union of the
        two selections is downloaded.
        """
        video_dicts = {}

        if splits == 'all' and challenges == 'all':
            download_splits = self.challenges_splits
        elif splits == 'all':
            download_splits = [cs for cs in self.challenges_splits for c in challenges if c == cs.split('_')[0]]
        elif challenges == 'all':
            download_splits = [cs for cs in self.challenges_splits for s in splits if s in cs.partition('_')[2]]
        else:
            download_splits = [cs for cs in self.challenges_splits for c in challenges for s in splits
                               if c == cs.split('_')[0] and s in cs.partition('_')[2]]

        # Participants may be given as numbers or as 'Pxx' strings; match the right field.
        if participants != 'all':
            participant_key = 'participant' if isinstance(participants[0], int) else 'participant_str'
        else:
            participant_key = None

        for ds in download_splits:
            if not extension_only and not epic55_only:
                vl = self.videos_per_split[ds]
            else:
                # we know that only one between extension_only and epic_55_only will be True
                vl = [v for v in self.videos_per_split[ds] if (extension_only and v['extension']) or
                      (epic55_only and not v['extension'])]

            if participants != 'all' and specific_videos == 'all':
                vl = [v for v in vl if v[participant_key] in participants]
            elif specific_videos != 'all' and participants == 'all':
                vl = [v for v in vl if v['video_id'] in specific_videos]
            elif participants != 'all' and specific_videos != 'all':
                vp = [v for v in vl if v[participant_key] in participants]
                vs = [v for v in vl if v['video_id'] in specific_videos]
                vl = vp + vs

            video_dicts.update({v['video_id']: v for v in vl})  # We use a dict to avoid duplicates

        # sorting the dictionary
        video_dicts = {k: video_dicts[k] for k in sorted(video_dicts.keys())}

        if epic55_only:
            source = 'EPIC 55'
        elif extension_only:
            source = 'EPIC 100 (extension only)'
        else:
            source = 'EPIC 100'

        what_str = ', '.join(' '.join(w.split('_')) for w in what)

        # str.format / str() instead of f-strings: the script advertises Python 3.5 support.
        if participants == 'all':
            participants_str = 'all'
        elif isinstance(participants[0], int):
            participants_str = ', '.join(['P{:02d}'.format(p) for p in participants])
        else:
            participants_str = ', '.join([str(p) for p in participants])

        videos_str = 'all' if specific_videos == 'all' else ', '.join([str(v) for v in specific_videos])

        if not self.errata_only:
            print('Going to download: {}\n'
                  'for challenges: {}\n'
                  'splits: {}\n'
                  'participants: {}\n'
                  'specific videos: {}\n'
                  'data source: {}'.format(what_str, challenges, splits, participants_str, videos_str, source))

        for w in what:
            if not self.errata_only:
                print_header('| Downloading {} now |'.format(' '.join(w.split('_'))), char='-')

            func = getattr(self, 'download_{}'.format(w))
            func(video_dicts)
def create_parser():
    """Build the command line parser of the downloader.

    The data-type flags (``--videos``, ``--masks``, ...) accumulate into ``what``, the
    challenge flags into ``challenges`` and the split flags into ``splits``. Each of
    these is ``None`` when no corresponding flag is given.

    Returns:
        argparse.ArgumentParser: the configured parser.
    """
    parser = argparse.ArgumentParser(add_help=True)

    parser.add_argument('--output-path', nargs='?', type=str, default=Path.home(),
                        help='Path where to download files. Default is {}'.format(Path.home()))

    # What to download
    parser.add_argument('--videos', dest='what', action='append_const', const='videos', help='Download videos')
    parser.add_argument('--rgb-frames', dest='what', action='append_const', const='rgb_frames',
                        help='Download rgb frames')
    parser.add_argument('--flow-frames', dest='what', action='append_const', const='flow_frames',
                        help='Download optical flow frames')
    parser.add_argument('--object-detection-images', dest='what', action='append_const',
                        const='object_detection_images', help='Download object detection images (only for EPIC 55)')
    parser.add_argument('--masks', dest='what', action='append_const',
                        const='masks', help='Download Mask R-CNN masks and hand-object correspondences')
    parser.add_argument('--metadata', dest='what', action='append_const',
                        const='metadata', help='Download GoPro\'s metadata (only for EPIC 100)')
    parser.add_argument('--consent-forms', dest='what', action='append_const', const='consent_forms',
                        help='Download consent_forms')

    # Which videos
    parser.add_argument('--participants', nargs='?', type=str, default='all',
                        help='Specify participants IDs. You can specify a single participant, e.g. `--participants 1` '
                             'or a comma-separated list of them, e.g. `--participants 1,2,3`')
    # The examples name this option itself (they used to show `--participants` by mistake).
    parser.add_argument('--specific-videos', nargs='?', type=str, default='all',
                        help='Specify video IDs. You can specify a single video, e.g. `--specific-videos P01_01` '
                             'or a comma-separated list of them, e.g. `--specific-videos P01_01,P01_02,P01_03`')
    parser.add_argument('--extension-only', action='store_true', help='Download extension only')
    parser.add_argument('--epic55-only', action='store_true', help='Download only EPIC 55\'s data')

    # Challenges
    parser.add_argument('--action-recognition', dest='challenges', action='append_const', const='ar',
                        help='Download data for the action recognition challenge')
    parser.add_argument('--domain-adaptation', dest='challenges', action='append_const', const='da',
                        help='Download data for the domain adaptation challenge')
    parser.add_argument('--action-retrieval', dest='challenges', action='append_const', const='cmr',
                        help='Download data for the action retrieval challenge')

    # Splits
    parser.add_argument('--train', dest='splits', action='append_const', const='train',
                        help='Download data from the training split (for action recognition and action retrieval)')
    parser.add_argument('--val', dest='splits', action='append_const', const='val',
                        help='Download data from the validation split (for action recognition only)')
    parser.add_argument('--test', dest='splits', action='append_const', const='test',
                        help='Download data from the testing split (for action recognition and action retrieval)')
    parser.add_argument('--source-train', dest='splits', action='append_const', const='source_train',
                        help='Download data from the source training split for domain adaptation')
    parser.add_argument('--source-test', dest='splits', action='append_const', const='source_test',
                        help='Download data from the source testing split for domain adaptation')
    parser.add_argument('--target-train', dest='splits', action='append_const', const='target_train',
                        help='Download data from the target training split for domain adaptation')
    parser.add_argument('--target-test', dest='splits', action='append_const', const='target_test',
                        help='Download data from the target testing split for domain adaptation')
    parser.add_argument('--val-source-train', dest='splits', action='append_const', const='val_source_train',
                        help='Download the smaller source train set used to validate hyper-parameters for domain '
                             'adaptation')
    parser.add_argument('--val-source-test', dest='splits', action='append_const', const='val_source_test',
                        help='Download the smaller source test set used to validate hyper-parameters for domain '
                             'adaptation')
    parser.add_argument('--val-target-train', dest='splits', action='append_const', const='val_target_train',
                        help='Download the smaller target train set used to validate hyper-parameters for domain '
                             'adaptation')
    parser.add_argument('--val-target-test', dest='splits', action='append_const', const='val_target_test',
                        help='Download the smaller target test set used to validate hyper-parameters for domain '
                             'adaptation')

    parser.add_argument('--errata', action='store_true', help='Download only errata files')

    return parser
def parse_args(parser):
    """Parse the command line with ``parser`` and validate/normalise the result.

    Normalisation applied to the returned namespace:
      * ``what`` defaults to every kind of data when no data-type flag is given;
        with ``--errata`` the consent forms are removed from it.
      * ``challenges`` and ``splits`` default to ``'all'``.
      * ``participants`` becomes a list of ints (``1,12``) or of ``Pxx`` strings
        (``P01,P12``), unless it is ``'all'``.
      * ``specific_videos`` becomes a list of video ids, unless it is ``'all'``.

    Invalid input is reported through ``parser.error``, which prints the usage and
    exits (``SystemExit``), so validation still runs under ``python -O``.

    Args:
        parser: an ``argparse.ArgumentParser`` providing the options read below.

    Returns:
        argparse.Namespace: the validated arguments.
    """
    args = parser.parse_args()

    if args.extension_only and args.epic55_only:
        parser.error('You can specify either --extension-only or --epic55-only, but not both')

    if args.extension_only and ['object_detection_images'] == args.what:
        parser.error('Object detection images are available only for EPIC 55')

    if args.epic55_only and ['metadata'] == args.what:
        parser.error('GoPro\'s metadata is available only for EPIC 100')

    if args.what is None:
        args.what = ('videos', 'rgb_frames', 'flow_frames', 'object_detection_images', 'metadata', 'consent_forms',
                     'masks')

    if args.challenges is None:
        args.challenges = 'all'

    if args.splits is None:
        args.splits = 'all'

    # The option is None when the flag is passed without a value (nargs='?').
    if args.participants is None:
        parser.error('--participants requires a value')

    if args.participants != 'all':
        participants = [p.strip() for p in args.participants.split(',')]

        # Every entry must use the same format; numeric ids may have any number of digits.
        if all(p.isdecimal() for p in participants):
            args.participants = [int(p) for p in participants]
        elif all(len(p) == 3 and p[0] == 'P' for p in participants):
            args.participants = participants
        else:
            parser.error('Invalid participant format. '
                         'Please enter the participants in numerical (1), '
                         'or string format (P01).')

    if args.specific_videos is None:
        parser.error('--specific-videos requires a value')

    if args.specific_videos != 'all':
        def is_video_id(video_id):
            # Expected shape: 'P' + two characters, '_', then a two or three character number.
            parts = video_id.split('_')
            return (len(parts) == 2 and len(parts[0]) == 3 and parts[0][0] == 'P'
                    and len(parts[1]) in (2, 3))

        args.specific_videos = [v.strip() for v in args.specific_videos.split(',')]

        if not all(is_video_id(v) for v in args.specific_videos):
            parser.error('A video ID is wrongly formatted. '
                         'Please ensure all video IDs are of the form PXX_YY(Y).')

    if args.errata:
        args.what = tuple(w for w in args.what if w != 'consent_forms')

    return args
if __name__ == '__main__':
    # Script entry point: check the interpreter version, parse the command line, then download.
    py_major, py_minor = sys.version_info.major, sys.version_info.minor
    assert py_major == 3 and py_minor >= 5, 'This script works with Python 3.5+. ' \
                                            'Please use a more recent Python version'

    parser = create_parser()
    args = parse_args(parser)

    print_header('*** Welcome to the EPIC Kitchens Downloader! ***')

    downloader = EpicDownloader(base_output=args.output_path, errata_only=args.errata)
    download_options = dict(
        what=args.what,
        participants=args.participants,
        specific_videos=args.specific_videos,
        splits=args.splits,
        challenges=args.challenges,
        extension_only=args.extension_only,
        epic55_only=args.epic55_only,
    )
    downloader.download(**download_options)

    print_header('*** All done, bye! ***')