find_files.py
import csv
from itertools import chain
from time import sleep

import requests


def is_image(file):
    """Return True if the file's MIME type marks it as a JP2 image."""
    return file['hasMimeType'] == 'image/jp2'


def get_filename(file):
    return file['filename']


def get_images(fileset):
    """Return the filenames of all JP2 images in a single fileset."""
    return list(map(get_filename, filter(is_image, fileset['structural']['contains'])))
def get_files(func, druid):
    """Fetch the PURL JSON for a druid and call func on each image filename."""
    max_retries = 3
    retry_count = 0
    backoff_factor = 2  # multiplier for increasing wait time between retries

    while retry_count < max_retries:
        try:
            url = f'https://purl.stanford.edu/{druid}.json'
            response = requests.get(url, timeout=5)
            response.raise_for_status()  # raise on bad status codes (4xx, 5xx)
            json_data = response.json()
            images = list(chain.from_iterable(map(get_images, json_data['structural']['contains'])))
            for image in images:
                func(image)
            break  # exit loop on success
        except requests.exceptions.Timeout:
            retry_count += 1
            print(f"Timeout occurred. Attempt {retry_count} of {max_retries}.")
            if retry_count == max_retries:
                print("Max retries reached. Unable to complete the request due to repeated timeouts.")
                break
            # Wait before retrying (exponential backoff: 2s, 4s, 8s, ...)
            sleep(backoff_factor ** retry_count)
        except requests.exceptions.HTTPError as err:
            print(f"HTTP error occurred for {druid}: {err}")
            break  # exit loop
        except requests.exceptions.RequestException as e:
            print(f"Error occurred for {druid}: {e}")
            break  # exit loop
def harvest_files(start_at=0):
    """Read druids from report.csv and append (druid, filename) rows to out.csv."""
    with open('report.csv', mode='r', newline='') as file:
        csv_reader = csv.reader(file)
        next(csv_reader)  # skip the header row
        with open('out.csv', mode='a', newline='') as outfile:
            csv_writer = csv.writer(outfile)
            if start_at == 0:
                csv_writer.writerow(['Druid', 'Filename'])
            for row_index, row in enumerate(csv_reader):
                if row_index >= start_at:
                    druid = row[0]
                    print(row_index)  # progress marker; useful for resuming
                    get_files(lambda image: csv_writer.writerow([druid, image]), druid)
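
# To resume an interrupted run, pass the last row index that was printed as
# start_at; earlier rows are skipped, and the header row is only written when
# start_at is 0, so appending to an existing out.csv stays safe.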
if __name__ == '__main__':
    harvest_files(start_at=0)