Skip to content

Commit

Permalink
Metadata: fix incorrect person identifiers when transforming metadata
Browse files Browse the repository at this point in the history
  • Loading branch information
lwesterhof committed Nov 21, 2023
1 parent bec08cb commit 58984fc
Showing 1 changed file with 91 additions and 10 deletions.
101 changes: 91 additions & 10 deletions schema_transformations.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ def _default2_default3(ctx, m):
"""
Add affiliation identifiers to creators and contributors.
Tags are renamed to Keywords and Related Datapackage to Related Resource.
Tags are renamed to Keywords, Related Datapackage is renamed to Related Resource, and Affiliation and Person Identifiers are improved.
:param ctx: Combined type of a callback and rei struct
:param m: Metadata to transform (default-2)
Expand All @@ -121,19 +121,67 @@ def _default2_default3(ctx, m):
if m.get('Creator', False):
# For this creator step through all its affiliations
for creator in m['Creator']:
new_affiliations = []
affiliations = []
for affiliation in creator['Affiliation']:
new_affiliations.append({"Affiliation_Name": affiliation, "Affiliation_Identifier": ""})
creator['Affiliation'] = new_affiliations
affiliations.append({"Affiliation_Name": affiliation, "Affiliation_Identifier": ""})
creator['Affiliation'] = affiliations

person_identifiers = []
for person_identifier in creator['Person_Identifier']:
if person_identifier.get('Name_Identifier_Scheme', True):
continue
elif person_identifier.get('Name_Identifier_Scheme', None) == 'ORCID':
# Check for incorrect ORCID format.
if not re.search("^(https://orcid.org/)[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]$", person_identifier.get('Name_Identifier', None)):
corrected_orcid = correctify_orcid(person_identifier['Name_Identifier'])
# Only if an actual correction took place, change the value and mark this data as 'changed'.
if corrected_orcid != person_identifier['Name_Identifier']:
person_identifier['Name_Identifier'] = corrected_orcid
elif person_identifier.get('Name_Identifier_Scheme', None) == 'ResearcherID (Web of Science)':
# Check for incorrect ResearcherID format.
if not re.search("^(https://www.researcherid.com/rid/)[A-Z]-[0-9]{4}-[0-9]{4}$", person_identifier.get('Name_Identifier', None)):
corrected_researcher_id = correctify_researcher_id(person_identifier['Name_Identifier'])
# Only if an actual correction took place, change the value and mark this data as 'changed'.
if corrected_researcher_id != person_identifier['Name_Identifier']:
person_identifier['Name_Identifier'] = corrected_researcher_id

person_identifiers.append({"Name_Identifier_Scheme": person_identifier['Name_Identifier_Scheme'], "Name_Identifier": person_identifier['Name_Identifier']})

if len(person_identifiers) > 0:
creator['Person_Identifier'] = person_identifiers

if m.get('Contributor', False):
# For this contributor step through all its affiliations
for contrib in m['Contributor']:
new_affiliations = []
if contrib.get('Affiliation', False):
for affiliation in contrib['Affiliation']:
new_affiliations.append({"Affiliation_Name": affiliation, "Affiliation_Identifier": ""})
contrib['Affiliation'] = new_affiliations
for contributor in m['Contributor']:
affiliations = []
if contributor.get('Affiliation', False):
for affiliation in contributor['Affiliation']:
affiliations.append({"Affiliation_Name": affiliation, "Affiliation_Identifier": ""})
contributor['Affiliation'] = affiliations

person_identifiers = []
for person_identifier in contributor['Person_Identifier']:
if person_identifier.get('Name_Identifier_Scheme', True):
continue
elif person_identifier.get('Name_Identifier_Scheme', None) == 'ORCID':
# Check for incorrect ORCID format.
if not re.search("^(https://orcid.org/)[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]$", person_identifier.get('Name_Identifier', None)):
corrected_orcid = correctify_orcid(person_identifier['Name_Identifier'])
# Only if an actual correction took place, change the value and mark this data as 'changed'.
if corrected_orcid != person_identifier['Name_Identifier']:
person_identifier['Name_Identifier'] = corrected_orcid
elif person_identifier.get('Name_Identifier_Scheme', None) == 'ResearcherID (Web of Science)':
# Check for incorrect ResearcherID format.
if not re.search("^(https://www.researcherid.com/rid/)[A-Z]-[0-9]{4}-[0-9]{4}$", person_identifier.get('Name_Identifier', None)):
corrected_researcher_id = correctify_researcher_id(person_identifier['Name_Identifier'])
# Only if an actual correction took place, change the value and mark this data as 'changed'.
if corrected_researcher_id != person_identifier['Name_Identifier']:
person_identifier['Name_Identifier'] = corrected_researcher_id

person_identifiers.append({"Name_Identifier_Scheme": person_identifier['Name_Identifier_Scheme'], "Name_Identifier": person_identifier['Name_Identifier']})

if len(person_identifiers) > 0:
contributor['Person_Identifier'] = person_identifiers

# Rename Tags to Keywords
if m.get('Tag', False):
Expand Down Expand Up @@ -643,3 +691,36 @@ def get(src_id, dst_id):

x = transformations.get(src_id)
return None if x is None else x.get(dst_id)


def correctify_orcid(org_orcid):
    """Correct an ill-formatted ORCID.

    All spaces are removed and every lower-case 'x' is upper-cased. The part
    after the last '/' is then checked against the ORCID id format
    (e.g. 1234-1234-1234-123X).

    :param org_orcid: Original ORCID value; a bare id or a path/URL ending in the id

    :returns: The ORCID as "https://orcid.org/<id>", or the original value
              unchanged when no valid id can be recognised
    """
    # Get rid of all spaces.
    orcid = org_orcid.replace(' ', '')

    # Upper-case X. Continue from the space-stripped value: restarting from
    # org_orcid here would silently undo the space removal above.
    orcid = orcid.replace('x', 'X')

    # The last part should hold a valid id like eg: 1234-1234-1234-123X.
    # If not, it is impossible to correct it to the valid orcid format.
    orcs = orcid.split('/')
    if not re.search("^[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]$", orcs[-1]):
        # Return original value.
        return org_orcid

    return "https://orcid.org/{}".format(orcs[-1])


def correctify_researcher_id(org_researcher_id):
    """Normalise a badly formatted ResearcherID to its full URL form.

    :param org_researcher_id: Original ResearcherID value; a bare id or a path/URL ending in the id

    :returns: The id as "https://www.researcherid.com/rid/<id>", or the
              original value unchanged when no valid id can be recognised
    """
    # Drop every space, then keep only what follows the final '/'.
    candidate = org_researcher_id.replace(' ', '').rsplit('/', 1)[-1]

    # Only a trailing part shaped like A-1234-1234 can be turned into a valid
    # ResearcherID URL.
    if re.search("^[A-Z]-[0-9]{4}-[0-9]{4}$", candidate):
        return "https://www.researcherid.com/rid/{}".format(candidate)

    # Nothing recognisable: hand back the input untouched.
    return org_researcher_id

0 comments on commit 58984fc

Please sign in to comment.