# (GitHub page-scrape artifacts — navigation text and line-number gutter — removed.)
# ########################################################################
#
# Ricgraph - Research in context graph
#
# ########################################################################
#
# MIT License
#
# Copyright (c) 2023 Rik D.T. Janssen
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
# ########################################################################
#
# This file contains example code for Ricgraph.
#
# With this code, you can harvest datasets from Yoda, using Datacite, with the OAI-PMH protocol.
# You have to set some parameters in ricgraph.ini.
# Also, you can set a number of parameters in the code following the "import" statements below.
#
# Original version Rik D.T. Janssen, December 2022.
# Updated Rik D.T. Janssen, April, October 2023.
#
# ########################################################################
#
# Usage
# harvest_yoda_datacite_to_ricgraph.py [options]
#
# Options:
# --empty_ricgraph <yes|no>
# 'yes': Ricgraph will be emptied before harvesting.
# 'no': Ricgraph will not be emptied before harvesting.
# If this option is not present, the script will prompt the user
# what to do.
#
# ########################################################################
import os.path
import sys
import pandas
import xmltodict
from sickle import Sickle
import configparser
import ricgraph as rcg
# Filename the raw OAI-PMH harvest (XML) is written to.
YODA_HARVEST_FILENAME = 'yoda_datacite_harvest.xml'
# Filename the parsed harvest (CSV) is written to.
YODA_DATA_FILENAME = 'yoda_datacite_data.csv'
# OAI-PMH request parameters. The 'set' key is added in the main
# section below, after it has been read from ricgraph.ini.
YODA_HEADERS = {'metadataPrefix': 'oai_datacite',
                'ignore_deleted': True
                }
# NOTE(review): a 'global' statement at module level is a no-op;
# YODA_URL is simply assigned in the main section below.
global YODA_URL
# ######################################################
# Mapping from Yoda Datacite research output types to Ricgraph research output types.
# ######################################################
ROTYPE_MAPPING_YODA = {
    'Research Data': rcg.ROTYPE_DATASET,
    'Method Description': rcg.ROTYPE_METHOD_DESCRIPTION,
    'Model': rcg.ROTYPE_MODEL,
    'Computer code': rcg.ROTYPE_SOFTWARE,
    'Other Document': rcg.ROTYPE_OTHER_CONTRIBUTION
}
# ######################################################
# Utility functions related to harvesting of Yoda
# ######################################################
# ######################################################
# Parsing
# ######################################################
def flatten_row(full_record: dict, dict_with_one_name: dict) -> dict:
    """The purpose of this function is, given a 'full_record', to
    flatten it. That means: most records have more than one name,
    sometimes as contributor, sometimes as creator.
    Full_records which have this, will be split into multiple records,
    where every record will have one of these names.
    Creator is a separate field, but I also include it in the list of contributors.

    Note: 'full_record' and 'dict_with_one_name' come from xmltodict, so
    a child element may be a dict (one occurrence) or a list of dicts
    (multiple occurrences). Several loops below use a
    "check-first-key-then-break" pattern to distinguish these two cases:
    iterating a dict yields its keys, iterating a list yields its items.

    :param full_record: see description above.
    :param dict_with_one_name: the dictionary with one name.
    :return: a dictionary with several parsed records.
    """
    new_record = {}
    # First copy the record-level fields (title, identifier, etc.).
    for item_in_full_record in full_record:
        key = item_in_full_record
        value = full_record[item_in_full_record]
        if item_in_full_record == 'titles':
            # I pick the first one.
            value = value['title']
        if item_in_full_record == 'identifier':
            # Use the identifier type (e.g. 'DOI') as the column name.
            key = value['@identifierType']
            value = value['#text']
        if item_in_full_record == 'resourceType':
            value = value['#text']
        if item_in_full_record == 'descriptions':
            description = value['description']
            value = description['#text']
        # NOTE: this function originally also parsed 'fundingReferences'
        # and 'subjects' (funder names, award numbers, keywords,
        # OECD FOS 2007 classifications). For Ricgraph this is not
        # necessary; that code has been removed. See the version history
        # of this file if you need it back.
        if item_in_full_record == 'creators' \
           or item_in_full_record == 'contributors':
            # This is done below.
            continue
        if item_in_full_record != 'fundingReferences':
            # Prevent setting it twice for fundingReferences.
            new_record[key] = value
    # Write name and all its identifiers.
    for item_in_names in dict_with_one_name:
        if item_in_names == 'nameIdentifier':
            # Separate the various nameIdentifiers (like SCOPUS_ID, ORCID, etc.)
            name_identifiers = dict_with_one_name['nameIdentifier']
            for identifier in name_identifiers:
                if identifier == '@nameIdentifierScheme':
                    # This is necessary, since there is only 1 dict
                    # (iterating it yields its keys, not sub-dicts).
                    key = name_identifiers['@nameIdentifierScheme']
                    value = name_identifiers['#text']
                    new_record[key] = value
                    break
                # List case: each 'identifier' is one nameIdentifier dict.
                key = identifier['@nameIdentifierScheme']
                value = identifier['#text']
                new_record[key] = value
        elif item_in_names == 'affiliation':
            affiliation_identifiers = dict_with_one_name['affiliation']
            for identifier in affiliation_identifiers:
                if identifier == '@affiliationIdentifierScheme':
                    # This one is to catch XML lines like
                    # <affiliation affiliationIdentifier="https://ror.org/04pp8hn57"
                    # affiliationIdentifierScheme="ROR">Utrecht University</affiliation>
                    key = item_in_names
                    value = affiliation_identifiers['#text']
                    new_record[key] = value
                    break
                # This one is to catch XML lines like
                # <affiliation>Utrecht University</affiliation>
                key = item_in_names
                value = dict_with_one_name[key]
                if isinstance(value, dict):
                    # A person is from multiple organizations.
                    value = value['#text']
                if isinstance(value, list):
                    if isinstance(value[0], dict):
                        # A person is from multiple organizations.
                        newvalue = []
                        for item in value:
                            newvalue.append(item['#text'])
                        value = newvalue.copy()
                new_record[key] = value
        else:
            # All other name items.
            key = item_in_names
            value = dict_with_one_name[key]
            new_record[key] = value
            if key == 'creatorName' or key == 'contributorName':
                # Also insert it in another column, for convenience.
                if not isinstance(value, str):
                    # Sometimes it is an OrderedDict.
                    new_record['contributorName'] = value['#text']
                else:
                    new_record['contributorName'] = value
                new_record['@contributorType'] = 'Creator'
    return new_record
def parse_yoda_datacite(harvest: dict) -> pandas.DataFrame:
    """Parse the harvested datasets (and other research outputs) from Yoda datacite.

    Every (record, name) combination becomes one row: flatten_row() is
    called once per creator and once per contributor of a record.

    :param harvest: the harvest (the xmltodict parse of the harvest file).
    :return: the harvested research outputs in a DataFrame.
    """
    list_of_records = harvest['makewellformedxml']['record']
    rowdict = []
    for item in list_of_records:
        record = item['metadata']['oai_datacite']
        record = record['payload']['resource']
        for item_in_record in record:
            if item_in_record == 'creators':
                # Make sure that all 'creators' will also be in 'contributors'.
                name_elements = record[item_in_record]['creator']
                for name_item in name_elements:
                    if name_item == 'creatorName':
                        # This is necessary, since there is only 1 dict
                        # (iterating a dict yields its keys).
                        newdict = flatten_row(record, name_elements)
                        rowdict.append(newdict)
                        break
                    # List case: one dict per creator.
                    newdict = flatten_row(record, name_item)
                    rowdict.append(newdict)
            if item_in_record == 'contributors':
                name_elements = record[item_in_record]['contributor']
                for name_item in name_elements:
                    if name_item == '@contributorType':
                        # This is necessary, since there is only 1 dict.
                        newdict = flatten_row(record, name_elements)
                        rowdict.append(newdict)
                        break
                    newdict = flatten_row(record, name_item)
                    rowdict.append(newdict)
    datacite_data = pandas.DataFrame(rowdict)
    # In column "affiliation", sometimes there is a string (when someone has one affiliation),
    # and sometimes a list of strings (when someone has more than one affiliation).
    # First, convert any string values in that column to single-element lists.
    # This ensures that all values in the column are list-like.
    datacite_data['affiliation'] = datacite_data['affiliation'].apply(lambda x: [x] if isinstance(x, str) else x)
    # Then, use the explode() function on the that column.
    # This will create a new row for each element in the lists, while keeping the other column values the same.
    # Finally, a reset_index(drop=True) is used to create a new unique index for all rows.
    datacite_data = datacite_data.explode('affiliation').reset_index(drop=True)
    # Map the Yoda resource type to a Ricgraph research output type.
    datacite_data['DOI_TYPE'] = datacite_data[['resourceType']].apply(
        lambda row: rcg.lookup_resout_type(research_output_type=row['resourceType'],
                                           research_output_mapping=ROTYPE_MAPPING_YODA), axis=1)
    datacite_data['DOI'] = datacite_data['DOI'].str.lower()
    # The next two statements will result in an 'behaviour will change in pandas 3.0' warning.
    # datacite_data['ORCID'].replace(regex=r'[a-z/:_. ]*', value='', inplace=True)
    # datacite_data['ISNI'].replace(regex=r'[ ]*', value='', inplace=True)
    # Strip URL prefixes/separators so only the bare identifier remains.
    datacite_data['ORCID'] = datacite_data['ORCID'].replace(regex=r'[a-z/:_. ]*', value='')
    datacite_data['ISNI'] = datacite_data['ISNI'].replace(regex=r'[ ]*', value='')
    # Keep only the columns Ricgraph needs.
    yoda_data = datacite_data[['DOI', 'contributorName', 'DAI',
                               'ORCID', 'Author identifier (Scopus)',
                               'ISNI', 'ResearcherID (Web of Science)',
                               'titles', 'DOI_TYPE', 'publicationYear',
                               'affiliation'
                               ]].copy(deep=True)
    # This does not seem to work:
    # yoda_data.drop_duplicates(keep='first', inplace=True, ignore_index=True)
    # Rename to the identifier names Ricgraph uses.
    yoda_data.rename(columns={'DAI': 'DIGITAL_AUTHOR_ID',
                              'Author identifier (Scopus)': 'SCOPUS_AUTHOR_ID',
                              'ResearcherID (Web of Science)': 'RESEARCHER_ID',
                              'contributorName': 'FULL_NAME',
                              'titles': 'TITLE',
                              'affiliation': 'ORGANIZATION_NAME'
                              }, inplace=True)
    return yoda_data
# ######################################################
# Harvesting and parsing
# ######################################################
# Inspiration for this function is from
# https://christinakouridi.blog/2019/06/16/harvesting-metadata-of-1-5million-arxiv-papers.
def harvest_xml_and_write_to_file(url: str, headers: dict, harvest_filename: str) -> None:
    """Harvest XML records from Yoda datacite and write them to a file.
    Get the data from an OAI-PMH endpoint. Save it to file.
    Best is to use "oai_datacite" protocol, not "datacite",
    see https://support.datacite.org/docs/datacite-oai-pmh.

    :param url: URL to harvest from.
    :param headers: headers for harvest.
    :param harvest_filename: filename to write harvest results to.
    :return: None.
    :raises AttributeError: when too many (more than 5) errors occurred
        while reading records from the OAI-PMH stream.
    """
    # Sickle takes care of resumptionToken if present, our source does not deliver all records in one xml
    print('Getting data from ' + url + '... ', end='')
    connection = Sickle(url)
    data = connection.ListRecords(**headers)
    print('Done.')
    iters = errors = 0
    print('Saving data to ' + harvest_filename + '...')
    print('Getting record: 0 ', end='')
    # Sickle doesn't deliver wellformed xml, so we place tags around it.
    # One file handle suffices for the whole write (the file used to be
    # opened three separate times).
    with open(harvest_filename, 'w') as f:
        f.write('<makewellformedxml>')
        while True:
            try:
                f.write(data.next().raw)
                iters += 1
                if iters % 50 == 0:
                    # Progress indicator, every 50 records.
                    print(iters, " ", end='', flush=True)
            except AttributeError:
                # Tolerate a handful of malformed records before giving up.
                if errors > 5:
                    raise AttributeError('\nharvest_xml_and_write_to_file(): Quitting, too many errors.\n')
                else:
                    print('\nharvest_xml_and_write_to_file(): some AttributeError occurred.\n')
                    errors += 1
            except StopIteration:
                # Sickle signals the end of the record stream.
                print(iters, " Done.", flush=True)
                break
        f.write('</makewellformedxml>')
    return
def harvest_and_parse_yoda_datacite_data(url: str, headers: dict, harvest_filename: str) -> pandas.DataFrame:
    """Harvest and parse data from Yoda datacite.

    First the raw XML is harvested to 'harvest_filename', then that file
    is read back, converted to a dict, and parsed into a DataFrame.

    :param url: URL to harvest from.
    :param headers: headers for harvest.
    :param harvest_filename: filename to write harvest results to.
    :return: the DataFrame harvested, or None if nothing harvested.
    """
    harvest_xml_and_write_to_file(url=url,
                                  headers=headers,
                                  harvest_filename=harvest_filename)
    with open(harvest_filename) as xml_file:
        harvest_dict = xmltodict.parse(xml_file.read())
    parsed_result = parse_yoda_datacite(harvest=harvest_dict)
    print('The harvested data from Yoda datacite are:')
    print(parsed_result)
    return parsed_result
# ######################################################
# Parsed results to Ricgraph
# ######################################################
def parsed_yoda_datacite_to_ricgraph(parsed_content: pandas.DataFrame) -> None:
    """Insert the parsed datasets, etc. in Ricgraph.

    :param parsed_content: The records to insert in Ricgraph, if not present yet.
    :return: None.
    """
    timestamp = rcg.datetimestamp()
    print('Inserting datasets from Yoda (harvested from datacite) in Ricgraph at '
          + timestamp + '...')
    history_event = 'Source: Harvest Yoda-datacite at ' + timestamp + '.'
    # The order of the columns in the DataFrame below is not random.
    # A good choice is to have in the first two columns:
    # a. the identifier that appears the most in the system we harvest.
    # b. the identifier(s) that is already present in Ricgraph from previous harvests,
    #    since new identifiers from this harvest will be linked to an already existing
    #    person-root.
    # If you have 2 of type (b), use these as the first 2 columns.
    person_identifiers = parsed_content[['ORCID', 'SCOPUS_AUTHOR_ID',
                                         'FULL_NAME', 'DIGITAL_AUTHOR_ID',
                                         'ISNI', 'RESEARCHER_ID']].copy(deep=True)
    # dropna(how='all'): drop row if all row values contain NaN.
    person_identifiers.dropna(axis=0, how='all', inplace=True)
    person_identifiers.drop_duplicates(keep='first', inplace=True, ignore_index=True)
    print('The following persons from Yoda will be inserted in Ricgraph:')
    print(person_identifiers)
    rcg.unify_personal_identifiers(personal_identifiers=person_identifiers,
                                   source_event='Yoda-DataCite',
                                   history_event=history_event)
    # We need to connect a dataset to a person-root. However, there is no single
    # person identifier that every person has. So we will need to connect DOIs to
    # every person-identifier we have. The same holds for organizations.
    # The identifiers below are processed in this order; only the first call
    # passes source_event/history_event, since that call inserts the DOI
    # (resp. organization) nodes and the subsequent calls only connect them.
    person_id_columns = ['ORCID', 'DIGITAL_AUTHOR_ID', 'SCOPUS_AUTHOR_ID',
                         'RESEARCHER_ID', 'ISNI']
    print('The following datasets (DOIs) from Yoda will be inserted in Ricgraph:')
    print(parsed_content)
    for count, id_column in enumerate(person_id_columns):
        print('\nAdding DOIs and ' + id_column + 's at ' + rcg.timestamp() + '...')
        params = {'name1': 'DOI',
                  'category1': parsed_content['DOI_TYPE'],
                  'value1': parsed_content['DOI'],
                  'name2': id_column,
                  'category2': 'person',
                  'value2': parsed_content[id_column]}
        if count == 0:
            # Only the first call inserts the DOI nodes with their title,
            # year, and provenance; the following calls connect to them.
            params['comment1'] = parsed_content['TITLE']
            params['year1'] = parsed_content['publicationYear']
            params['source_event1'] = 'Yoda-DataCite'
            params['history_event1'] = history_event
        rcg.create_nodepairs_and_edges_params(**params)
    # Same for organizations.
    print('The following organizations from person from Yoda will be inserted in Ricgraph:')
    print(parsed_content)
    for count, id_column in enumerate(person_id_columns):
        print('\nAdding organizations and ' + id_column + 's at ' + rcg.timestamp() + '...')
        params = {'name1': 'ORGANIZATION_NAME',
                  'category1': 'organization',
                  'value1': parsed_content['ORGANIZATION_NAME'],
                  'name2': id_column,
                  'category2': 'person',
                  'value2': parsed_content[id_column]}
        if count == 0:
            params['source_event1'] = 'Yoda-DataCite'
            params['history_event1'] = history_event
        rcg.create_nodepairs_and_edges_params(**params)
    print('\nDone at ' + rcg.timestamp() + '.\n')
    return
# ############################################
# ################### main ###################
# ############################################
rcg.print_commandline_arguments(argument_list=sys.argv)
# Read the Yoda harvesting parameters from the Ricgraph ini file.
config = configparser.ConfigParser()
config.read(rcg.get_ricgraph_ini_file())
try:
    YODA_URL = config['Yoda_harvesting']['yoda_url']
    YODA_SET = config['Yoda_harvesting']['yoda_set']
    if YODA_URL == '' or YODA_SET == '':
        print('Ricgraph initialization: error, yoda_url or yoda_set empty in Ricgraph ini file, exiting.')
        exit(1)
    YODA_HEADERS['set'] = YODA_SET
except KeyError:
    print('Ricgraph initialization: error, yoda_url or yoda_set not found in Ricgraph ini file, exiting.')
    exit(1)
print('\nPreparing graph...')
rcg.open_ricgraph()
# Determine whether to empty Ricgraph before harvesting, from the
# '--empty_ricgraph' command line option.
empty_graph = rcg.get_commandline_argument(argument='--empty_ricgraph',
                                           argument_list=sys.argv)
if empty_graph == '':
    # Option not given: empty_ricgraph() will prompt the user.
    # Empty Ricgraph, choose one of the following.
    # rcg.empty_ricgraph(answer='yes')
    # rcg.empty_ricgraph(answer='no')
    rcg.empty_ricgraph()
else:
    rcg.empty_ricgraph(answer=empty_graph)
rcg.graphdb_nr_accesses_print()
# Harvest, parse, save to CSV, and insert in Ricgraph.
parse_yoda_data = harvest_and_parse_yoda_datacite_data(url=YODA_URL,
                                                       headers=YODA_HEADERS,
                                                       harvest_filename=YODA_HARVEST_FILENAME)
if parse_yoda_data is None or parse_yoda_data.empty:
    print('There are no data from Yoda to harvest.\n')
else:
    rcg.write_dataframe_to_csv(filename=YODA_DATA_FILENAME,
                               df=parse_yoda_data)
    parsed_yoda_datacite_to_ricgraph(parsed_content=parse_yoda_data)
rcg.graphdb_nr_accesses_print()
rcg.close_ricgraph()