grab_urls.py
"""Build per-publisher URL lists, metadata JSON and a downloads.curl script
for IATI Registry datasets listed at registry.codeforiati.org."""
import sys
import shutil
import json
from os.path import dirname, exists, join
from os import makedirs, getenv, chdir
from time import sleep

import requests


def request_with_backoff(*args, attempts=100, backoff=0.1, **kwargs):
    """Issue an HTTP request, retrying with exponential backoff on failure."""
    for attempt in range(1, attempts + 1):
        # Exponential backoff: 0.1s, 0.3s, 0.7s, 1.5s, ...
        wait = (pow(2, attempt) - 1) * backoff
        try:
            result = requests.request(*args, **kwargs)
            if result.status_code == 200:
                # Throttle briefly even on success before returning
                sleep(wait)
                return result
        except requests.exceptions.ConnectionError:
            pass
        print(f'Error! Retrying after {wait} seconds')
        sleep(wait)
    raise Exception(f'Failed after {attempts} attempts. Giving up.')
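# Note: request_with_backoff is a retry helper that is not called elsewhere in
# this file; an illustrative use would be
#     request_with_backoff('GET', 'https://registry.codeforiati.org/dataset_list.json')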


def main(args):
    cache = '--cache' in args
    skip_metadata = '--skip-metadata' in args
    if skip_metadata:
        print('Skipping metadata')

    # Loop through each dataset and save the URL endpoints of the data files.
    # You may need to set up an empty directory called "urls".
    print('Fetching:')
    # Template for one curl command per dataset. The -y/-Y options abort a
    # transfer that runs slower than 1000 bytes/s for 30 seconds; on failure,
    # the exit code is logged to logs/{dataset_name}.log and an empty
    # placeholder XML file is created.
    tmpl = 'curl -L -k -f -s ' + \
        '-H "Accept: application/xhtml+xml, application/xml,*/*;q=0.9" ' + \
        '--retry 4 --retry-delay 10 -y 30 ' + \
        '-Y 1000 -A "IATI data dump 1.0" ' + \
        '--create-dirs -o data/{publisher_id}/{dataset_name}.xml ' + \
        '"{url}" 2>&1 >/dev/null ; exitcode=$? ; ' + \
        'test "$exitcode" != 0 && ' + \
        'echo $exitcode {publisher_id} {dataset_name} "{url}" > ' + \
        'logs/{dataset_name}.log && touch ' + \
        'data/{publisher_id}/{dataset_name}.xml\n'
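    # For illustration only, with hypothetical values publisher_id='example-org',
    # dataset_name='example-org-activities' and url='https://example.org/iati.xml',
    # the template renders to a single shell line of the form (wrapped here for
    # readability):
    #   curl -L -k -f -s ... --create-dirs -o data/example-org/example-org-activities.xml \
    #       "https://example.org/iati.xml" 2>&1 >/dev/null ; exitcode=$? ; test "$exitcode" != 0 && ...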
    # Fetch the full dataset and publisher lists from the registry
    datasets = requests.get(
        "https://registry.codeforiati.org/dataset_list.json").json()["result"]
    publishers = requests.get(
        "https://registry.codeforiati.org/publisher_list.json").json()["result"]
    publishers = {
        publisher["name"]: publisher
        for publisher in publishers
    }
    for dataset in datasets:
        organization = dataset['organization']
        if dataset['resources'] == [] or not organization:
            continue
        dataset_name = dataset['name']
        url = dataset['resources'][0]['url']
        publisher_id = organization["name"]
        if cache:
            # Reuse a previously downloaded copy from cache/ instead of refetching
            filename = join(publisher_id, dataset_name + '.xml')
            cache_file = join('cache', filename)
            if exists(cache_file):
                out_file = join('data', filename)
                out_path = dirname(out_file)
                if not exists(out_path):
                    makedirs(out_path, exist_ok=True)
                shutil.move(cache_file, out_file)
        if not skip_metadata:
            # Write publisher-level and dataset-level metadata as JSON
            metadata_filepath = f'metadata/{publisher_id}'
            if not exists(metadata_filepath):
                makedirs(metadata_filepath)
            publisher_metadata = publishers[publisher_id]
            publisher_metadata_file = f'{metadata_filepath}.json'
            with open(publisher_metadata_file, 'w') as f:
                json.dump(publisher_metadata, f)
            metadata_file = f'{metadata_filepath}/{dataset_name}.json'
            with open(metadata_file, 'w') as f:
                json.dump(dataset, f)
        # Record the dataset name and URL under urls/<publisher_id>
        with open(f'urls/{publisher_id}', 'a') as f:
            f.write(f'{dataset_name} {url}\n')
        # Append the rendered curl command for this dataset
        output = tmpl.format(
            publisher_id=publisher_id,
            dataset_name=dataset_name,
            url=url.replace(' ', '%20'),
        )
        with open('downloads.curl', 'a') as f:
            f.write(output)


if __name__ == '__main__':
    # Optionally switch to a different working directory before running
    working_dir = getenv('GRAB_URLS_WORKING_DIR')
    if working_dir:
        chdir(working_dir)
    main(sys.argv[1:])
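
# Example invocation (illustrative path; per the comment in main(), an empty
# "urls" directory may be needed in the working directory):
#
#     GRAB_URLS_WORKING_DIR=/path/to/workdir python grab_urls.py --cache --skip-metadata
#
# Outputs written by the code above:
#     urls/<publisher_id>                          one "<dataset_name> <url>" line per dataset
#     metadata/<publisher_id>.json                 publisher metadata (unless --skip-metadata)
#     metadata/<publisher_id>/<dataset_name>.json  dataset metadata (unless --skip-metadata)
#     downloads.curl                               curl commands saving each dataset to
#                                                  data/<publisher_id>/<dataset_name>.xml,
#                                                  with failures logged to logs/<dataset_name>.log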