Skip to content

Commit

Permalink
YDA-5951 add transformation default-3 ISNI and Scopus ID
Browse files Browse the repository at this point in the history
The metadata schema transformation code from default-2 to default-3 transformed ORCID-IDs and Researcher IDs, but not Scopus IDs and ISNI IDs. Because of this, metadata files with the default-2 schema containing Scopus or ISNI IDs can usually not be converted automatically to default-3.

Solution:

Add transformation functions that convert an ISNI or Scopus ID consisting of a series of digits (with optional spaces) to the format specified in the default-3 schema.
  • Loading branch information
leonidastri authored Sep 26, 2024
1 parent 15dbc78 commit d1bb8c8
Show file tree
Hide file tree
Showing 4 changed files with 210 additions and 37 deletions.
89 changes: 52 additions & 37 deletions schema_transformations.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

import re

from schema_transformations_utils import correctify_isni, correctify_orcid, correctify_researcher_id, correctify_scopus

import meta
from util import *

Expand Down Expand Up @@ -128,21 +130,44 @@ def _default2_default3(ctx, m):

person_identifiers = []
for person_identifier in creator.get('Person_Identifier', []):
# Check ORCID
if person_identifier.get('Name_Identifier_Scheme', None) == 'ORCID':
# Check for incorrect ORCID format.
if not re.search("^(https://orcid.org/)[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]$", person_identifier.get('Name_Identifier', None)):
corrected_orcid = correctify_orcid(person_identifier['Name_Identifier'])
# Only it an actual correction took place change the value and mark this data as 'changed'.
# Only if an actual correction took place change the value and mark this data as 'changed'.
if corrected_orcid is None:
log.write(ctx, "Warning: could not correct ORCID %s during schema transformation. It needs to be fixed manually."
% (person_identifier['Name_Identifier']))
elif corrected_orcid != person_identifier['Name_Identifier']:
person_identifier['Name_Identifier'] = corrected_orcid
# Check Scopus
elif person_identifier.get('Name_Identifier_Scheme', None) == 'Author identifier (Scopus)':
# Check for incorrect Scopus format.
if not re.search("^\d{1,11}$", person_identifier.get('Name_Identifier', None)):
corrected_scopus = correctify_scopus(person_identifier['Name_Identifier'])
# Only if an actual correction took place change the value and mark this data as 'changed'.
if corrected_scopus is None:
log.write(ctx, "Warning: could not correct Scopus %s during schema transformation. It needs to be fixed manually."
% (person_identifier['Name_Identifier']))
elif corrected_scopus != person_identifier['Name_Identifier']:
person_identifier['Name_Identifier'] = corrected_scopus
# Check ISNI
elif person_identifier.get('Name_Identifier_Scheme', None) == 'ISNI':
# Check for incorrect ISNI format.
if not re.search("^(https://isni.org/isni/)[0-9]{15}[0-9X]$", person_identifier.get('Name_Identifier', None)):
corrected_isni = correctify_isni(person_identifier['Name_Identifier'])
# Only if an actual correction took place change the value and mark this data as 'changed'.
if corrected_isni is None:
log.write(ctx, "Warning: could not correct ISNI %s during schema transformation. It needs to be fixed manually."
% (person_identifier['Name_Identifier']))
elif corrected_isni != person_identifier['Name_Identifier']:
person_identifier['Name_Identifier'] = corrected_isni
elif person_identifier.get('Name_Identifier_Scheme', None) == 'ResearcherID (Web of Science)':
# Check for incorrect ResearcherID format.
if not re.search("^(https://www.researcherid.com/rid/)[A-Z]-[0-9]{4}-[0-9]{4}$", person_identifier.get('Name_Identifier', None)):
corrected_researcher_id = correctify_researcher_id(person_identifier['Name_Identifier'])
# Only it an actual correction took place change the value and mark this data as 'changed'.
# Only if an actual correction took place change the value and mark this data as 'changed'.
if corrected_researcher_id != person_identifier['Name_Identifier']:
person_identifier['Name_Identifier'] = corrected_researcher_id
elif 'Name_Identifier_Scheme' not in person_identifier:
Expand All @@ -164,21 +189,44 @@ def _default2_default3(ctx, m):

person_identifiers = []
for person_identifier in contributor.get('Person_Identifier', []):
# Check ORCID
if person_identifier.get('Name_Identifier_Scheme', None) == 'ORCID':
# Check for incorrect ORCID format.
if not re.search("^(https://orcid.org/)[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]$", person_identifier.get('Name_Identifier', None)):
corrected_orcid = correctify_orcid(person_identifier['Name_Identifier'])
# Only it an actual correction took place change the value and mark this data as 'changed'.
# Only if an actual correction took place change the value and mark this data as 'changed'.
if corrected_orcid is None:
log.write(ctx, "Warning: could not correct ORCID %s during schema transformation. It needs to be fixed manually."
% (person_identifier['Name_Identifier']))
elif corrected_orcid != person_identifier['Name_Identifier']:
person_identifier['Name_Identifier'] = corrected_orcid
# Check Scopus
elif person_identifier.get('Name_Identifier_Scheme', None) == 'Author identifier (Scopus)':
# Check for incorrect Scopus format.
if not re.search("^\d{1,11}$", person_identifier.get('Name_Identifier', None)):
corrected_scopus = correctify_scopus(person_identifier['Name_Identifier'])
# Only if an actual correction took place change the value and mark this data as 'changed'.
if corrected_scopus is None:
log.write(ctx, "Warning: could not correct Scopus %s during schema transformation. It needs to be fixed manually."
% (person_identifier['Name_Identifier']))
elif corrected_scopus != person_identifier['Name_Identifier']:
person_identifier['Name_Identifier'] = corrected_scopus
# Check ISNI
elif person_identifier.get('Name_Identifier_Scheme', None) == 'ISNI':
# Check for incorrect ISNI format.
if not re.search("^(https://isni.org/isni/)[0-9]{15}[0-9X]$", person_identifier.get('Name_Identifier', None)):
corrected_isni = correctify_isni(person_identifier['Name_Identifier'])
# Only if an actual correction took place change the value and mark this data as 'changed'.
if corrected_isni is None:
log.write(ctx, "Warning: could not correct ISNI %s during schema transformation. It needs to be fixed manually."
% (person_identifier['Name_Identifier']))
elif corrected_isni != person_identifier['Name_Identifier']:
person_identifier['Name_Identifier'] = corrected_isni
elif person_identifier.get('Name_Identifier_Scheme', None) == 'ResearcherID (Web of Science)':
# Check for incorrect ResearcherID format.
if not re.search("^(https://www.researcherid.com/rid/)[A-Z]-[0-9]{4}-[0-9]{4}$", person_identifier.get('Name_Identifier', None)):
corrected_researcher_id = correctify_researcher_id(person_identifier['Name_Identifier'])
# Only it an actual correction took place change the value and mark this data as 'changed'.
# Only if an actual correction took place change the value and mark this data as 'changed'.
if corrected_researcher_id != person_identifier['Name_Identifier']:
person_identifier['Name_Identifier'] = corrected_researcher_id
elif 'Name_Identifier_Scheme' not in person_identifier:
Expand Down Expand Up @@ -702,36 +750,3 @@ def get(src_id, dst_id):

x = transformations.get(src_id)
return None if x is None else x.get(dst_id)


def correctify_orcid(org_orcid):
    """Correct illformatted ORCID.

    All spaces are removed and a lower-case 'x' is upper-cased. The part after
    the last '/' must then be a valid id like 1234-1234-1234-123X.

    :param org_orcid: Original ORCID value (bare id or URL)

    :returns: "https://orcid.org/<id>" if the value could be corrected,
              otherwise the original value unchanged
    """
    # Chain both corrections so that the space removal is not lost when the
    # check character is upper-cased.
    orcid = org_orcid.replace(' ', '').replace('x', 'X')

    # The last part should hold a valid id like eg: 1234-1234-1234-123X.
    # If not, it is impossible to correct it to the valid orcid format.
    orcid_id = orcid.split('/')[-1]

    # \Z (instead of $) anchors at the very end of the string, so an id
    # followed by a newline is not accepted.
    if not re.search(r"^[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]\Z", orcid_id):
        # Return original value.
        return org_orcid

    return "https://orcid.org/{}".format(orcid_id)


def correctify_researcher_id(org_researcher_id):
    """Correct illformatted ResearcherID.

    All spaces are removed. The part after the last '/' must then be a valid
    id like A-1234-1234.

    :param org_researcher_id: Original ResearcherID value (bare id or URL)

    :returns: "https://www.researcherid.com/rid/<id>" if the value could be
              corrected, otherwise the original value unchanged
    """
    # Get rid of all spaces.
    researcher_id = org_researcher_id.replace(' ', '')

    # The last part should hold a valid id like eg: A-1234-1234
    # If not, it is impossible to correct it to the valid ResearcherID format
    rid = researcher_id.split('/')[-1]

    # \Z (instead of $) anchors at the very end of the string, so an id
    # followed by a newline is not turned into a URL containing that newline.
    if not re.search(r"^[A-Z]-[0-9]{4}-[0-9]{4}\Z", rid):
        # Return original value.
        return org_researcher_id

    return "https://www.researcherid.com/rid/{}".format(rid)
61 changes: 61 additions & 0 deletions schema_transformations_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import re


def correctify_orcid(org_orcid):
    """Correct illformatted ORCID.

    All spaces are removed and a lower-case 'x' is upper-cased. The part after
    the last '/' must then be a valid id like 1234-1234-1234-123X.

    :param org_orcid: Original ORCID value (bare id or URL)

    :returns: "https://orcid.org/<id>" if the value could be corrected,
              otherwise None
    """
    # Get rid of all spaces and upper-case the X check character.
    orcid = org_orcid.replace(' ', '').replace('x', 'X')

    # The last part should hold a valid id like eg: 1234-1234-1234-123X.
    # If not, it is impossible to correct it to the valid orcid format.
    orcid_id = orcid.split('/')[-1]

    # \Z (instead of $) anchors at the very end of the string, so an id
    # followed by a newline is rejected rather than returned with the newline.
    if not re.search(r"^[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]\Z", orcid_id):
        return None

    return "https://orcid.org/{}".format(orcid_id)


def correctify_scopus(org_scopus):
    """Correct illformatted Scopus author identifier.

    All spaces are removed. The remainder must consist of 1 to 11 ASCII digits.

    :param org_scopus: Original Scopus author identifier

    :returns: The identifier without spaces if it is valid, otherwise None
    """
    # Get rid of all spaces.
    new_scopus = org_scopus.replace(' ', '')

    # Raw string avoids the invalid "\d" escape sequence. [0-9] restricts the
    # match to ASCII digits, and \Z (instead of $) rejects a trailing newline.
    if not re.search(r"^[0-9]{1,11}\Z", new_scopus):
        return None

    return new_scopus


def correctify_isni(org_isni):
    """Correct ill-formatted ISNI.

    All spaces are removed and a lower-case 'x' is upper-cased. The part after
    the last '/' must then be a valid id like 123412341234123X.

    :param org_isni: Original ISNI value (bare id or URL)

    :returns: "https://isni.org/isni/<id>" if the value could be corrected,
              otherwise None
    """
    # Remove all spaces and upper-case the X check character.
    new_isni = org_isni.replace(' ', '').replace('x', 'X')

    # The last part should hold a valid id like eg: 123412341234123X.
    # If not, it is impossible to correct it to the valid isni format.
    isni_id = new_isni.split('/')[-1]

    # \Z (instead of $) anchors at the very end of the string, so an id
    # followed by a newline is rejected rather than returned with the newline.
    if not re.search(r"^[0-9]{15}[0-9X]\Z", isni_id):
        return None

    return "https://isni.org/isni/{}".format(isni_id)


def correctify_researcher_id(org_researcher_id):
    """Correct illformatted ResearcherID.

    All spaces are removed. The part after the last '/' must then be a valid
    id like A-1234-1234.

    :param org_researcher_id: Original ResearcherID value (bare id or URL)

    :returns: "https://www.researcherid.com/rid/<id>" if the value could be
              corrected, otherwise the original value unchanged
    """
    # Get rid of all spaces.
    researcher_id = org_researcher_id.replace(' ', '')

    # The last part should hold a valid id like eg: A-1234-1234
    # If not, it is impossible to correct it to the valid ResearcherID format
    rid = researcher_id.split('/')[-1]

    # \Z (instead of $) anchors at the very end of the string, so an id
    # followed by a newline is not turned into a URL containing that newline.
    if not re.search(r"^[A-Z]-[0-9]{4}-[0-9]{4}\Z", rid):
        # Return original value.
        return org_researcher_id

    return "https://www.researcherid.com/rid/{}".format(rid)
93 changes: 93 additions & 0 deletions unit-tests/test_schema_transformations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# -*- coding: utf-8 -*-
"""Unit tests for the correctify functions in schema_transformations_utils"""

__copyright__ = 'Copyright (c) 2024, Utrecht University'
__license__ = 'GPLv3, see LICENSE'

import sys
from unittest import TestCase

sys.path.append('..')

from schema_transformations_utils import correctify_isni, correctify_orcid, correctify_scopus


class CorrectifyIsniTest(TestCase):
    """Unit tests for correctify_isni."""

    def test_isni_correct_format(self):
        """A well-formed ISNI URL is returned unchanged"""
        well_formed = "https://isni.org/isni/1234123412341234"
        result = correctify_isni(well_formed)
        self.assertEqual(result, well_formed)

    def test_isni_correct_format_containing_x(self):
        """A lower-case x check character is upper-cased"""
        result = correctify_isni("https://isni.org/isni/123412341234123x")
        self.assertEqual(result, "https://isni.org/isni/123412341234123X")

    def test_isni_invalid_format(self):
        """An ISNI that is one character short cannot be corrected"""
        result = correctify_isni("123412341234123")
        self.assertIsNone(result)

    def test_isni_malformed_format(self):
        """An ISNI containing letters cannot be corrected"""
        result = correctify_isni("foobar0123456789")
        self.assertIsNone(result)

    def test_isni_with_spaces(self):
        """Surrounding spaces are removed and the ISNI is corrected"""
        result = correctify_isni(" https://isni.org/isni/123412341234123x ")
        self.assertEqual(result, "https://isni.org/isni/123412341234123X")


class CorrectifyOrcidTest(TestCase):
    """Unit tests for correctify_orcid."""

    def test_orcid_correct_format(self):
        """A well-formed ORCID URL is returned unchanged"""
        well_formed = "https://orcid.org/1234-1234-1234-1234"
        result = correctify_orcid(well_formed)
        self.assertEqual(result, well_formed)

    def test_orcid_correct_format_containing_x(self):
        """A lower-case x check character is upper-cased"""
        result = correctify_orcid("https://orcid.org/1234-1234-1234-123x")
        self.assertEqual(result, "https://orcid.org/1234-1234-1234-123X")

    def test_orcid_invalid_format(self):
        """An ORCID that is one character short cannot be corrected"""
        result = correctify_orcid("1234-1234-1234-123")
        self.assertIsNone(result)

    def test_orcid_malformed_format(self):
        """An ORCID containing letters cannot be corrected"""
        result = correctify_orcid("1234-foo-bar-1234")
        self.assertIsNone(result)

    def test_orcid_with_spaces(self):
        """Surrounding spaces are removed and the ORCID is corrected"""
        result = correctify_orcid(" https://orcid.org/1234-1234-1234-123x ")
        self.assertEqual(result, "https://orcid.org/1234-1234-1234-123X")


class CorrectifyScopusTest(TestCase):
def test_correctify_format(self):
"""Test SCOPUS with correct format"""
scopus = "12345678901"
self.assertEqual(correctify_scopus(scopus), scopus)

def test_correctify_invalid_format(self):
"""Test SCOPUS with invalid format"""
scopus = "123456789012"
self.assertIsNone(correctify_scopus(scopus))

def test_malformed_format(self):
"""Test SCOPUS with invalid format"""
scopus = "foobar1234"
self.assertIsNone(correctify_scopus(scopus))

def test_orcid_with_spaces(self):
"""Test SCOPUS that contains spaces and should be corrected"""
scopus = " 01234567890 "
corrected_scopus = "01234567890"
self.assertEqual(correctify_scopus(scopus), corrected_scopus)
4 changes: 4 additions & 0 deletions unit-tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,17 @@
from test_intake import IntakeTest
from test_policies import PoliciesTest
from test_revisions import RevisionTest
from test_schema_transformations import CorrectifyIsniTest, CorrectifyOrcidTest, CorrectifyScopusTest
from test_util_misc import UtilMiscTest
from test_util_pathutil import UtilPathutilTest
from test_util_yoda_names import UtilYodaNamesTest


def suite():
test_suite = TestSuite()
test_suite.addTest(makeSuite(CorrectifyIsniTest))
test_suite.addTest(makeSuite(CorrectifyOrcidTest))
test_suite.addTest(makeSuite(CorrectifyScopusTest))
test_suite.addTest(makeSuite(GroupImportTest))
test_suite.addTest(makeSuite(IntakeTest))
test_suite.addTest(makeSuite(PoliciesTest))
Expand Down

0 comments on commit d1bb8c8

Please sign in to comment.