Skip to content

Commit

Permalink
Metadata: fix incorrect person identifiers when transforming metadata
Browse files Browse the repository at this point in the history
  • Loading branch information
lwesterhof committed Nov 21, 2023
1 parent bec08cb commit f3a09d2
Showing 1 changed file with 91 additions and 10 deletions.
101 changes: 91 additions & 10 deletions schema_transformations.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ def _default2_default3(ctx, m):
"""
Add affiliation identifiers to creators and contributors.
Tags are renamed to Keywords and Related Datapackage to Related Resource.
Tags are renamed to Keywords, Related Datapackage renamed to Related Resource and improved Affiliation and Person Identifiers.
:param ctx: Combined type of a callback and rei struct
:param m: Metadata to transform (default-2)
Expand All @@ -121,19 +121,67 @@ def _default2_default3(ctx, m):
if m.get('Creator', False):
# For this creator step through all its affiliations
for creator in m['Creator']:
new_affiliations = []
affiliations = []
for affiliation in creator['Affiliation']:
new_affiliations.append({"Affiliation_Name": affiliation, "Affiliation_Identifier": ""})
creator['Affiliation'] = new_affiliations
affiliations.append({"Affiliation_Name": affiliation, "Affiliation_Identifier": ""})
creator['Affiliation'] = affiliations

personal_identifiers = []
for personal_identifier in creator['Person_Identifier']:
if personal_identifier.get('Name_Identifier_Scheme', True):
continue
elif personal_identifier.get('Name_Identifier_Scheme', None) == 'ORCID':
# Check for incorrect ORCID format.
if not re.search("^(https://orcid.org/)[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]$", personal_identifier.get('Name_Identifier', None)):
corrected_orcid = correctify_orcid(personal_identifier['Name_Identifier'])
# Only if an actual correction took place, change the value and mark this data as 'changed'.
if corrected_orcid != personal_identifier['Name_Identifier']:
personal_identifier['Name_Identifier'] = corrected_orcid
elif personal_identifier.get('Name_Identifier_Scheme', None) == 'ResearcherID (Web of Science)':
# Check for incorrect ResearcherID format.
if not re.search("^(https://www.researcherid.com/rid/)[A-Z]-[0-9]{4}-[0-9]{4}$", personal_identifier.get('Name_Identifier', None)):
corrected_researcher_id = correctify_researcher_id(personal_identifier['Name_Identifier'])
# Only if an actual correction took place, change the value and mark this data as 'changed'.
if corrected_researcher_id != personal_identifier['Name_Identifier']:
personal_identifier['Name_Identifier'] = corrected_researcher_id

personal_identifiers.append({"Name_Identifier_Scheme": personal_identifier['Name_Identifier_Scheme'], "Name_Identifier": personal_identifier['Name_Identifier']})

if len(personal_identifiers) > 0:
creator['Person_Identifier'] = personal_identifiers

if m.get('Contributor', False):
# For this contributor step through all its affiliations
for contrib in m['Contributor']:
new_affiliations = []
if contrib.get('Affiliation', False):
for affiliation in contrib['Affiliation']:
new_affiliations.append({"Affiliation_Name": affiliation, "Affiliation_Identifier": ""})
contrib['Affiliation'] = new_affiliations
for contributor in m['Contributor']:
affiliations = []
if contributor.get('Affiliation', False):
for affiliation in contributor['Affiliation']:
affiliations.append({"Affiliation_Name": affiliation, "Affiliation_Identifier": ""})
contributor['Affiliation'] = affiliations

personal_identifiers = []
for personal_identifier in contributor['Person_Identifier']:
if personal_identifier.get('Name_Identifier_Scheme', True):
continue
elif personal_identifier.get('Name_Identifier_Scheme', None) == 'ORCID':
# Check for incorrect ORCID format.
if not re.search("^(https://orcid.org/)[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]$", personal_identifier.get('Name_Identifier', None)):
corrected_orcid = correctify_orcid(personal_identifier['Name_Identifier'])
# Only if an actual correction took place, change the value and mark this data as 'changed'.
if corrected_orcid != personal_identifier['Name_Identifier']:
personal_identifier['Name_Identifier'] = corrected_orcid
elif personal_identifier.get('Name_Identifier_Scheme', None) == 'ResearcherID (Web of Science)':
# Check for incorrect ResearcherID format.
if not re.search("^(https://www.researcherid.com/rid/)[A-Z]-[0-9]{4}-[0-9]{4}$", personal_identifier.get('Name_Identifier', None)):
corrected_researcher_id = correctify_researcher_id(personal_identifier['Name_Identifier'])
# Only if an actual correction took place, change the value and mark this data as 'changed'.
if corrected_researcher_id != personal_identifier['Name_Identifier']:
personal_identifier['Name_Identifier'] = corrected_researcher_id

personal_identifiers.append({"Name_Identifier_Scheme": personal_identifier['Name_Identifier_Scheme'], "Name_Identifier": personal_identifier['Name_Identifier']})

if len(personal_identifiers) > 0:
contributor['Person_Identifier'] = personal_identifiers

# Rename Tags to Keywords
if m.get('Tag', False):
Expand Down Expand Up @@ -643,3 +691,36 @@ def get(src_id, dst_id):

x = transformations.get(src_id)
return None if x is None else x.get(dst_id)


def correctify_orcid(org_orcid):
    """Correct an ill-formatted ORCID.

    Spaces are removed and a lower-case 'x' is upper-cased. The part after
    the last '/' must then look like 1234-1234-1234-123X; if it does, it is
    returned in the canonical https://orcid.org/ URL form.

    :param org_orcid: ORCID as originally stored (bare identifier or URL form)

    :returns: Corrected ORCID URL, or the original value unchanged when it
              cannot be corrected to the valid ORCID format
    """
    # Get rid of all spaces and upper-case the 'x'.
    # Both normalisations are chained on the same value: applying the second
    # one to org_orcid again would silently discard the space removal.
    orcid = org_orcid.replace(' ', '').replace('x', 'X')

    # The last part should hold a valid id like eg: 1234-1234-1234-123X.
    # If not, it is impossible to correct it to the valid orcid format.
    orcid_id = orcid.split('/')[-1]
    if not re.search("^[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]$", orcid_id):
        # Return original value.
        return org_orcid

    return "https://orcid.org/{}".format(orcid_id)


def correctify_researcher_id(org_researcher_id):
    """Correct an ill-formatted ResearcherID.

    All spaces are stripped, after which the part following the last '/'
    must look like A-1234-1234. A matching identifier is returned in the
    https://www.researcherid.com/rid/ URL form.

    :param org_researcher_id: ResearcherID as originally stored (bare identifier or URL form)

    :returns: Corrected ResearcherID URL, or the original value unchanged
              when it cannot be corrected to the valid ResearcherID format
    """
    without_spaces = org_researcher_id.replace(' ', '')

    # Only the trailing path segment can carry the actual identifier.
    candidate = without_spaces.rsplit('/', 1)[-1]

    if re.search("^[A-Z]-[0-9]{4}-[0-9]{4}$", candidate):
        return "https://www.researcherid.com/rid/{}".format(candidate)

    # Not correctable: hand back the value exactly as it came in.
    return org_researcher_id

0 comments on commit f3a09d2

Please sign in to comment.