-
Notifications
You must be signed in to change notification settings - Fork 0
/
json2oai.py
111 lines (96 loc) · 3.78 KB
/
json2oai.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!/usr/bin/env python3
import xmltodict
import os
import json
from datetime import datetime
JSON_DIR = "data"
metadata_format = "oai_dc"
current_datetime = datetime.now().isoformat()
def read_jsons(directory = JSON_DIR):
"""read all json files in the /data directory and return a list of json objects"""
jsons = {}
for filename in os.listdir(directory):
if filename.endswith(".json"):
filepath = os.path.join(directory, filename)
with open(filepath) as file:
jsons[filename[:-5]] = json.load(file)
return jsons
def prepare_oai_dict():
"""prepare the oai response dictionary"""
oai_dict = {
'OAI-PMH': {
'@xmlns': 'http://www.openarchives.org/OAI/2.0/',
'@xmlns:xsi': 'http://www.w3.org/2001/XMLSchema-instance',
'@xsi:schemaLocation': 'http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd',
'responseDate': current_datetime,
'request': {
'@verb': 'ListRecords',
'@metadataPrefix': metadata_format,
'#text': 'https://dataverse.harvard.edu/oai'},
'ListRecords': {
'record': []
}
}
}
return oai_dict
def prepare_dc_json(doi, json):
"""Extract Dublin Core metadata from the json object and add it to the oai response"""
#TODO map non-dc metadata to dublin core, verify which fields exist in Dataverse
dc_json = {}
identifier = None
for key, value in json.items():
key = key.lower().replace(".",":")
if key.startswith("dc:"):
dc_json[key] = value
#FIND IDENTIFIER AS URL OR DOI
if key.startswith("dc:identifier"):
if type(value) is str:
value = [value]
for v in value:
if key.startswith("dc:identifier:doi"):
if key.startswith("doi:"):
identifier = "http://doi.org/" + v.split("doi:")[1]
else:
identifier = "http://doi.org/" + v
elif key.startswith("dc:identifier:uri"):
if key.startswith("http"):
identifier = key
elif key == "dc:identifier":
if key.startswith("doi"):
identifier = "http://doi.org/" + v.split("doi:")[1]
elif key.startswith("http"):
identifier = key
if identifier is None:
print (f"DOI {doi} has no valid identifier")
return {}
if len(dc_json) == 0:
print(f"DOI {doi} has no dc-prefixed metadata")
return {}
oai_dict = {
'header': {
'identifier': identifier,
'datestamp': current_datetime,
},
'metadata': {
'oai_dc:dc': {
'@xmlns:xsi': 'http://www.w3.org/2001/XMLSchema-instance',
'@xmlns:oai_dc': 'http://www.openarchives.org/OAI/2.0/oai_dc/',
'@xmlns:dc': 'http://purl.org/dc/elements/1.1/',
'@xsi:schemaLocation': 'http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd',
}
}
}
oai_dict['metadata']['oai_dc:dc'].update(dc_json)
return oai_dict
def main():
"""reads json citation data and writes it to an XML file in OAI-PMH format"""
oai_dict = prepare_oai_dict()
jsons = read_jsons()
for name, data in jsons.items():
# TODO replace name from filename to doi
oai_record = prepare_dc_json(name, data)
oai_dict['OAI-PMH']['ListRecords']['record'].append(oai_record)
with open("outputs/oai", "w") as file:
file.write(xmltodict.unparse(oai_dict, pretty=True))
if __name__ == "__main__":
main()