forked from OpenDataScotland/the_od_bods
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmerge_data.py
465 lines (415 loc) · 19.2 KB
/
merge_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
# pylint: disable=E1101
### Setting the environment
import pandas as pd
import os
from datetime import datetime as dt
import datetime
import re
import json
def main():
### Loading data
# region CKAN
### From ckan output
print("Merging CKAN...")
source_ckan = pd.DataFrame()
folder = "data/ckan/"
for dirname, _, filenames in os.walk(folder):
for filename in filenames:
if filename.rsplit(".", 1)[1] == "csv":
print(filename)
source_ckan = pd.concat(
[
source_ckan,
pd.read_csv(
folder + r"/" + filename,
parse_dates=["DateCreated", "DateUpdated"],
lineterminator="\n",
),
]
)
source_ckan["Source"] = "ckan API"
# endregion
# region statistics.gov.scot
print("Merging statistics.gov.scot...")
### From scotgov csv
source_scotgov = pd.read_csv("data/scotgov-datasets-sparkql.csv")
source_scotgov = source_scotgov.rename(
columns={
"title": "Title",
"category": "OriginalTags",
"organization": "Owner",
"notes": "Description",
"date_created": "DateCreated",
"date_updated": "DateUpdated",
"url": "PageURL",
"licence": "License",
}
)
source_scotgov["Source"] = "sparql"
#print("DateUpdated " + source_scotgov["DateUpdated"])
#print("DateCreated " + source_scotgov["DateCreated"])
try:
source_scotgov["DateUpdated"] = pd.to_datetime(
source_scotgov["DateUpdated"], utc=True
).dt.tz_localize(None)
except:
try:
source_scotgov["DateUpdated"] = pd.to_datetime(
source_scotgov["DateUpdated"], utc=True, format="ISO8601"
).dt.tz_localize(None)
except:
# If we get to this stage, give up and just blank the date
print("WARNING: Failed to parse date - " + source_scotgov["DateUpdated"])
source_scotgov["DateUpdated"] = None
try:
source_scotgov["DateCreated"] = pd.to_datetime(
source_scotgov["DateCreated"], utc=True
).dt.tz_localize(None)
except:
try:
source_scotgov["DateCreated"] = pd.to_datetime(
source_scotgov["DateCreated"], utc=True, format="ISO8601"
).dt.tz_localize(None)
except:
# If we get to this stage, give up and just blank the date
print("WARNING: Failed to parse date - " + source_scotgov["DateCreated"])
source_scotgov["DateCreated"] = None
# endregion
# region ArcGIS
### From arcgis api
print("Merging ArcGIS...")
source_arcgis = pd.DataFrame()
folder = "data/arcgis/"
for dirname, _, filenames in os.walk(folder):
for filename in filenames:
if filename.rsplit(".", 1)[1] == "csv":
source_arcgis = pd.concat(
[
source_arcgis,
pd.read_csv(
folder + r"/" + filename,
parse_dates=["DateCreated", "DateUpdated"],
),
]
)
source_arcgis["Source"] = "arcgis API"
# endregion
# region uSmart
### From usmart api
print("Merging uSmart...")
source_usmart = pd.DataFrame()
folder = "data/USMART/"
for dirname, _, filenames in os.walk(folder):
for filename in filenames:
if filename.rsplit(".", 1)[1] == "csv":
source_usmart = pd.concat(
[
source_usmart,
pd.read_csv(
folder + r"/" + filename,
parse_dates=["DateCreated", "DateUpdated"],
),
]
)
source_usmart["Source"] = "USMART API"
source_usmart["DateUpdated"] = source_usmart["DateUpdated"].dt.tz_localize(None)
source_usmart["DateCreated"] = source_usmart["DateCreated"].dt.tz_localize(None)
# endregion
# region DCAT
## From DCAT
print("Merging DCAT...")
source_dcat = pd.DataFrame()
folder = "data/dcat/"
for dirname, _, filenames in os.walk(folder):
for filename in filenames:
if filename.rsplit(".", 1)[1] == "csv":
source_dcat = pd.concat(
[
source_dcat,
pd.read_csv(
folder + r"/" + filename,
parse_dates=["DateCreated", "DateUpdated"],
),
]
)
source_dcat["DateUpdated"] = source_dcat["DateUpdated"].dt.tz_localize(None)
# source_dcat["DateCreated"] = source_dcat["DateCreated"].dt.tz_localize(None) ### DateCreated currently not picked up in dcat so all are NULL
source_dcat["Source"] = "DCAT feed"
# endregion
# region Web scrapers
## From web scraped results
print("Merging web scraped results...")
source_scraped = pd.DataFrame()
folder = "data/scraped-results/"
for dirname, _, filenames in os.walk(folder):
for filename in filenames:
if filename.rsplit(".", 1)[1] == "csv":
print(f"\tMerging {filename}...")
source_scraped = pd.concat(
[
source_scraped,
pd.read_csv(
folder + r"/" + filename,
parse_dates=["DateCreated", "DateUpdated"],
),
]
)
# From Scottish Parliament
print("\tMerging Scottish Parliament...")
path = "data/bespoke_ScottishParliament/Scottish Parliament.json"
scottish_parliament_scraped = pd.read_json(path, convert_dates=["dateCreated", "dateUpdated"])
for index, row in scottish_parliament_scraped.iterrows():
resources = pd.json_normalize(row["resources"])
for resource_index, resource_row in resources.iterrows():
# TEMP FIX: Need to do this mapping until we modify the merged_output.json schema to support nesting resources inside each dataset entry
source_scraped = pd.concat(
[source_scraped, pd.DataFrame.from_records([{"Title": row["title"], "Owner": row["owner"], "PageURL": row["pageURL"], "AssetURL": resource_row["assetUrl"], "DateCreated": row["dateCreated"], "DateUpdated": row["dateUpdated"], "FileSize": resource_row["fileSize"], "FileType": resource_row["fileType"], "NumRecords": resource_row["numRecords"], "OriginalTags": row["tags"], "ManualTags" : row["tags"], "License": row["licence"], "Description": row["description"], "FileName": resource_row["fileName"]}])]
)
source_scraped["Source"] = "Web Scraped"
# endregion
### Combine all data into single table
print("Concatenating all")
data = pd.concat(
[
source_ckan,
source_arcgis,
source_usmart,
source_scotgov,
source_dcat,
source_scraped,
]
)
data = data.reset_index(drop=True)
print(f"Output untidy {dt.now()}")
### Saves copy of data without cleaning - for analysis purposes
data.to_json("data/merged_output_untidy.json", orient="records", date_format="iso")
### clean data
# TODO: Cleaning data per dataset file is massively inefficient as we're often applying the same operations to duplicate row values (e.g. 1 dataset x 5 files == 5 cleaning attempts for the same license data etc.)
print(f"Cleaning data {dt.now()}")
data = clean_data(data)
### Output cleaned data to json
print(f"Output cleaned {dt.now()}")
data.to_json("data/merged_output.json", orient="records", date_format="iso")
return data
def clean_data(dataframe):
"""cleans data in a dataframe
Args:
dataframe (pd.dataframe): the name of the dataframe of data to clean
Returns:
dataframe: dataframe of cleaned data
"""
### to avoid confusion and avoid re-naming everything...
data = dataframe
### Renaming entries to match
# TODO: Move this to a JSON file instead of hardcoding
owner_renames = {
"Aberdeen": "Aberdeen City Council",
"Dundee": "Dundee City Council",
"Perth": "Perth & Kinross Council",
"Perth and Kinross Council": "Perth & Kinross Council",
"Stirling": "Stirling Council",
"Angus": "Angus Council",
"open.data@southayrshire": "South Ayrshire Council",
"SEPA": "Scottish Environment Protection Agency",
"South Ayrshire": "South Ayrshire Council",
"East Ayrshire": "East Ayrshire Council",
"Highland Council GIS Organisation": "Highland Council",
"Scottish.Forestry": "Scottish Forestry",
"Na h-Eileanan an Iar": "Comhairle nan Eilean Siar",
"National Records Scotland": "National Records of Scotland",
"Development, Safety and Regulation": "South Ayrshire Council", # TEMP fix
"Stirling Council - insights by location": "Stirling Council",
"Aberdeen City Council ArcGIS Online": "Aberdeen City Council",
"City of Edinburgh Council Mapping": "City of Edinburgh Council",
"Cairngorms National Park": "Cairngorms National Park Authority",
"Loch Lomond and The Trossachs National Park": "Loch Lomond and The Trossachs National Park Authority",
"Drinking Water Quality Regulator (DWQR)": "Drinking Water Quality Regulator",
"DCC Public GIS Portal": "Dundee City Council",
}
data["Owner"] = data["Owner"].replace(owner_renames)
### Format dates as datetime type
data["DateCreated"] = pd.to_datetime(
data["DateCreated"], format="%Y-%m-%d", errors="coerce", utc=True
).dt.date
data["DateUpdated"] = pd.to_datetime(
data["DateUpdated"], format="%Y-%m-%d", errors="coerce", utc=True
).dt.date
### Inconsistencies in casing for FileType
data["FileType"] = data["FileType"].str.upper()
### Creating a dummy column
data["AssetStatus"] = None
### Cleaning dataset categories
def tidy_categories(categories_string):
"""tidies the categories: removes commas, strips whitespace, converts all to lower and strips any trailing ";"
Args:
categories_string (string): the dataset categories as a string
"""
tidied_string = str(categories_string).replace(",", ";")
tidied_list = [
cat.lower().strip() for cat in tidied_string.split(";") if cat != ""
]
tidied_string = ";".join(str(cat) for cat in tidied_list if str(cat) != "nan")
if len(tidied_string) > 0:
if tidied_string[-1] == ";":
tidied_string = tidied_string[:-1]
return tidied_string
### Tidy tag columns
data["OriginalTags"] = data["OriginalTags"].apply(tidy_categories)
data["ManualTags"] = data["ManualTags"].apply(tidy_categories)
trailing_s_pattern = re.compile("[Ss]$")
### Creating dataset categories for ODS
def remove_trailing_s(string):
"""Remove trailing 's' from all words in string to remove requirement to search for plural categories in
Args:
string: String to remove trialing 's' from
Returns:
string: the resulting string, with trailing 's' removed from all words.
"""
words = string.split()
s = [trailing_s_pattern.sub("", word) for word in words]
sentence = " ".join(s)
return sentence
def find_keyword(str_tofind, str_findin):
"""Finds if single word or phrase exists in string
Args:
str_tofind (str): the word or phrase to find
str_findin (str): the body of text to search in
Returns:
boolean: True if match is found
"""
str_findin = remove_trailing_s(str_findin)
str_tofind = remove_trailing_s(str_tofind)
if re.search(r"\b" + re.escape(str_tofind) + r"\b", str_findin, re.I):
return True
return False
def match_categories(str_tocategorise):
"""Cycles through keywords and keyphrases to check if used in body of text
Args:
str_tocategorise (str): body of text to search in
Returns:
list: the resulting categories as a string, as well as a dictionary of the keyphrases which resulted in a category
"""
category_dict = {}
for category in ods_categories.keys():
keyword_list = []
for keyword in ods_categories[category]:
if find_keyword(keyword, str_tocategorise):
keyword_list.append(keyword)
category_dict[category] = keyword_list
if len(category_dict) == 0:
category_list = "Uncategorised"
else:
category_list = ";".join(list(category_dict.keys()))
return [category_list, category_dict]
def get_categories(row_index):
"""combines title and description together to then search for keyword or keyphrase in
Args:
row_index (pandas df row): a single row in a pandas dataframe to check. Must have columns "Title" and "Description"
Returns:
list: the resulting categories as a string, as well as a dictionary of the keyphrases which resulted in a category
"""
str_title_description = (
str(row_index["Title"]) + " " + str(row_index["Description"])
)
categories_result = match_categories(str_title_description)
return categories_result
with open("ODSCategories.json") as json_file:
ods_categories = json.load(json_file)
### Apply ODS categorisation
data[["ODSCategories", "ODSCategories_Keywords"]] = data.apply(
lambda x: get_categories(x), result_type="expand", axis=1
)
### Tidy licence names
def tidy_licence(licence_name):
"""Temporary licence conversion to match export2jkan -- FOR ANALYTICS ONLY, will discard in 2022Q2 Milestone
Returns:
string: a tidied licence name
"""
known_licences = {
"https://creativecommons.org/licenses/by-sa/3.0/": "Creative Commons Attribution Share-Alike 3.0",
"https://creativecommons.org/licenses/by/4.0/legalcode": "Creative Commons Attribution 4.0 International",
"https://creativecommons.org/licenses/by/4.0": "Creative Commons Attribution 4.0 International",
"http://creativecommons.org/licenses/by-sa/3.0/": "Creative Commons Attribution Share-Alike 3.0",
"http://creativecommons.org/licenses/by/4.0/legalcode": "Creative Commons Attribution 4.0 International",
"http://creativecommons.org/licenses/by/4.0": "Creative Commons Attribution 4.0 International",
"Creative Commons Attribution 4.0": "Creative Commons Attribution 4.0 International",
"https://creativecommons.org/share-your-work/public-domain/cc0": "Creative Commons CC0",
"https://rightsstatements.org/page/NoC-NC/1.0/": "Non-Commercial Use Only",
"https://opendatacommons.org/licenses/odbl/1-0/": "Open Data Commons Open Database License 1.0",
"http://creativecommons.org/share-your-work/public-domain/cc0": "Creative Commons CC0",
"http://rightsstatements.org/page/NoC-NC/1.0/": "Non-Commercial Use Only",
"http://opendatacommons.org/licenses/odbl/1-0/": "Open Data Commons Open Database License 1.0",
"Open Data Commons Open Database License 1.0": "Open Data Commons Open Database License 1.0",
"https://www.nationalarchives.gov.uk/doc/open-government-licence/version/2/": "Open Government Licence v2.0",
"https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/": "Open Government Licence v3.0",
"http://www.nationalarchives.gov.uk/doc/open-government-licence/version/2/": "Open Government Licence v2.0",
"http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/": "Open Government Licence v3.0",
"Open Government Licence 3.0 (United Kingdom)": "Open Government Licence v3.0",
"UK Open Government Licence (OGL)": "Open Government Licence v3.0",
"Open Government": "Open Government Licence v3.0",
"uk-ogl": "Open Government Licence v3.0",
"OGL3": "Open Government Licence v3.0",
"https://rightsstatements.org/vocab/NKC/1.0/": "No Known Copyright",
"https://creativecommons.org/publicdomain/mark/1.0/": "Public Domain",
"http://rightsstatements.org/vocab/NKC/1.0/": "No Known Copyright",
"http://creativecommons.org/publicdomain/mark/1.0/": "Public Domain",
"Other (Public Domain)": "Public Domain",
"Public Domain": "Public Domain",
"Public Sector End User Licence (Scotland)": "Public Sector End User Licence (Scotland)",
"Scottish Parliament Copyright Policy": "Scottish Parliament Copyright Policy"
}
for key in known_licences.keys():
if str(licence_name).lower().strip(" /") == key.lower().strip(" /"):
return known_licences[key]
if str(licence_name) == "nan":
tidied_licence = "No licence"
else:
tidied_licence = "Custom licence: " + str(licence_name)
return tidied_licence
data["License"] = data["License"].apply(tidy_licence)
def tidy_file_type(file_type):
"""Temporary data type conversion
Args:
file_type (str): the data type name
Returns:
tidied_file_type (str): a tidied data type name
"""
file_types_to_tidy = {
"application/x-7z-compressed": "7-Zip compressed file",
"ArcGIS GeoServices REST API": "ARCGIS GEOSERVICE",
"Esri REST": "ARCGIS GEOSERVICE",
"Atom Feed": "ATOM FEED",
"htm": "HTML",
"ics": "iCalendar",
"jpeg": "Image",
"vnd.openxmlformats-officedocument.spreadsheetml.sheet": "MS EXCEL",
"vnd.ms-excel": "MS EXCEL",
"xls": "MS EXCEL",
"xlsx": "MS EXCEL",
"doc": "MS Word",
"docx": "MS Word",
"QGIS": "QGIS Shapefile",
"text": "TXT",
"web": "URL",
"UK/DATA/#TABGB1900": "URL",
"UK/ROY/GAZETTEER/#DOWNLOAD": "URL",
"Web Mapping Application": "WEB MAP",
"mets": "XML",
"alto": "XML",
}
tidied_data_type = "NULL"
for key in file_types_to_tidy.keys():
if str(file_type).lower().strip(". /") == key.lower().strip(". /"):
tidied_file_type = file_types_to_tidy[key]
return tidied_file_type
if str(file_type) == "nan" or str(file_type) == "":
tidied_file_type = "No file type"
else:
# print("file type: ", file_type)
tidied_file_type = str(file_type).strip(". /").upper()
return tidied_file_type
### Inconsistencies in casing for FileType
data["FileType"] = data["FileType"].apply(tidy_file_type)
return data
if __name__ == "__main__":
main()