-
Notifications
You must be signed in to change notification settings - Fork 13
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
YDA-5951 add transformation default-3 ISNI and Scopus ID
The metadata schema transformation code from default-2 to default-3 transformed ORCID iDs and ResearcherIDs, but not Scopus IDs and ISNI IDs. Because of this, metadata files with the default-2 schema that contain Scopus or ISNI IDs usually cannot be converted automatically to default-3. Solution: add a transformation function that converts an ISNI or Scopus ID consisting of a series of digits (with optional spaces) to the format specified in the default-3 schema.
- Loading branch information
1 parent
15dbc78
commit d1bb8c8
Showing
4 changed files
with
210 additions
and
37 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
import re | ||
|
||
|
||
def correctify_orcid(org_orcid):
    """Correct an ill-formatted ORCID iD.

    All whitespace is removed, a lower-case ``x`` is upper-cased, and the part
    after the last ``/`` is taken as the identifier.

    :param org_orcid: Original ORCID value, either a bare identifier or a
                      URL/path ending in one

    :returns: ``https://orcid.org/<id>`` when the identifier has the form
              ``1234-1234-1234-123X``, otherwise None
    """
    # Drop every whitespace character (tabs, newlines, ...), not only plain
    # spaces, so pasted values with stray line breaks can still be corrected.
    orcid = ''.join(org_orcid.split())

    # Upper-case X.
    orcid = orcid.replace('x', 'X')

    # The last part should hold a valid id like eg: 1234-1234-1234-123X.
    # If not, it is impossible to correct it to the valid orcid format.
    orcid_id = orcid.split('/')[-1]

    # \Z anchors at the real end of the string; "$" would also accept a
    # trailing newline, which would then end up in the returned URL.
    if not re.search(r"^[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]\Z", orcid_id):
        return None

    return "https://orcid.org/{}".format(orcid_id)
|
||
|
||
def correctify_scopus(org_scopus):
    """Correct an ill-formatted Scopus ID.

    All whitespace is removed; what remains must be 1 to 11 ASCII digits.

    :param org_scopus: Original Scopus ID value

    :returns: The Scopus ID without whitespace, or None when it does not
              consist of 1 to 11 digits
    """
    # Drop every whitespace character (tabs, newlines, ...), not only plain spaces.
    new_scopus = ''.join(org_scopus.split())

    # [0-9] rather than \d: on Python 3, \d also matches non-ASCII Unicode
    # digits. \Z rather than "$": "$" would accept a trailing newline.
    if not re.search(r"^[0-9]{1,11}\Z", new_scopus):
        return None

    return new_scopus
|
||
|
||
def correctify_isni(org_isni):
    """Correct an ill-formatted ISNI.

    All whitespace is removed, a lower-case ``x`` is upper-cased, and the part
    after the last ``/`` is taken as the identifier.

    :param org_isni: Original ISNI value, either a bare identifier or a
                     URL/path ending in one

    :returns: ``https://isni.org/isni/<id>`` when the identifier consists of
              15 digits followed by a digit or ``X``, otherwise None
    """
    # Drop every whitespace character (tabs, newlines, ...), not only plain
    # spaces, so pasted values with stray line breaks can still be corrected.
    cleaned = ''.join(org_isni.split())

    # Upper-case X.
    cleaned = cleaned.replace('x', 'X')

    # The last part should hold a valid id like eg: 123412341234123X.
    # If not, it is impossible to correct it to the valid isni format.
    isni_id = cleaned.split('/')[-1]

    # \Z anchors at the real end of the string; "$" would also accept a
    # trailing newline, which would then end up in the returned URL.
    if not re.search(r"^[0-9]{15}[0-9X]\Z", isni_id):
        return None

    return "https://isni.org/isni/{}".format(isni_id)
|
||
|
||
def correctify_researcher_id(org_researcher_id):
    """Correct an ill-formatted ResearcherID.

    All whitespace is removed and the part after the last ``/`` is taken as
    the identifier.

    :param org_researcher_id: Original ResearcherID value, either a bare
                              identifier or a URL/path ending in one

    :returns: ``https://www.researcherid.com/rid/<id>`` when the identifier
              has the form ``A-1234-1234``; otherwise the original value,
              unmodified
    """
    # Drop every whitespace character (tabs, newlines, ...), not only plain
    # spaces, so pasted values with stray line breaks can still be corrected.
    researcher_id = ''.join(org_researcher_id.split())

    # The last part should hold a valid id like eg: A-1234-1234
    # If not, it is impossible to correct it to the valid ResearcherID format
    rid = researcher_id.split('/')[-1]

    # \Z anchors at the real end of the string; "$" would also accept a
    # trailing newline, which would then end up in the returned URL.
    if not re.search(r"^[A-Z]-[0-9]{4}-[0-9]{4}\Z", rid):
        # Return original value.
        return org_researcher_id

    return "https://www.researcherid.com/rid/{}".format(rid)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
# -*- coding: utf-8 -*- | ||
"""Unit tests for the correctify functions in schema_transformations""" | ||
|
||
__copyright__ = 'Copyright (c) 2024, Utrecht University' | ||
__license__ = 'GPLv3, see LICENSE' | ||
|
||
import sys | ||
from unittest import TestCase | ||
|
||
sys.path.append('..') | ||
|
||
from schema_transformations_utils import correctify_isni, correctify_orcid, correctify_scopus | ||
|
||
|
||
class CorrectifyIsniTest(TestCase):
    """Unit tests for correctify_isni."""

    def test_isni_correct_format(self):
        """A well-formed ISNI URL is returned unchanged"""
        well_formed = "https://isni.org/isni/1234123412341234"
        result = correctify_isni(well_formed)
        self.assertEqual(result, well_formed)

    def test_isni_correct_format_containing_x(self):
        """A lower-case trailing x is upper-cased"""
        result = correctify_isni("https://isni.org/isni/123412341234123x")
        self.assertEqual(result, "https://isni.org/isni/123412341234123X")

    def test_isni_invalid_format(self):
        """An ISNI that is one character short yields None"""
        too_short = "123412341234123"
        self.assertIsNone(correctify_isni(too_short))

    def test_isni_malformed_format(self):
        """An ISNI containing letters other than a trailing X yields None"""
        malformed = "foobar0123456789"
        self.assertIsNone(correctify_isni(malformed))

    def test_isni_with_spaces(self):
        """Surrounding spaces are removed and the ISNI is corrected"""
        padded = " https://isni.org/isni/123412341234123x "
        result = correctify_isni(padded)
        self.assertEqual(result, "https://isni.org/isni/123412341234123X")
|
||
|
||
class CorrectifyOrcidTest(TestCase):
    """Unit tests for correctify_orcid."""

    def test_orcid_correct_format(self):
        """A well-formed ORCID URL is returned unchanged"""
        well_formed = "https://orcid.org/1234-1234-1234-1234"
        result = correctify_orcid(well_formed)
        self.assertEqual(result, well_formed)

    def test_orcid_correct_format_containing_x(self):
        """A lower-case trailing x is upper-cased"""
        result = correctify_orcid("https://orcid.org/1234-1234-1234-123x")
        self.assertEqual(result, "https://orcid.org/1234-1234-1234-123X")

    def test_orcid_invalid_format(self):
        """An ORCID that is one character short yields None"""
        too_short = "1234-1234-1234-123"
        self.assertIsNone(correctify_orcid(too_short))

    def test_orcid_malformed_format(self):
        """An ORCID with non-numeric groups yields None"""
        malformed = "1234-foo-bar-1234"
        self.assertIsNone(correctify_orcid(malformed))

    def test_orcid_with_spaces(self):
        """Surrounding spaces are removed and the ORCID is corrected"""
        padded = " https://orcid.org/1234-1234-1234-123x "
        result = correctify_orcid(padded)
        self.assertEqual(result, "https://orcid.org/1234-1234-1234-123X")
|
||
|
||
class CorrectifyScopusTest(TestCase):
    """Unit tests for correctify_scopus."""

    def test_correctify_format(self):
        """An 11-digit Scopus ID is returned unchanged"""
        well_formed = "12345678901"
        result = correctify_scopus(well_formed)
        self.assertEqual(result, well_formed)

    def test_correctify_invalid_format(self):
        """A 12-digit Scopus ID (one digit too many) yields None"""
        too_long = "123456789012"
        self.assertIsNone(correctify_scopus(too_long))

    def test_malformed_format(self):
        """A Scopus ID containing letters yields None"""
        malformed = "foobar1234"
        self.assertIsNone(correctify_scopus(malformed))

    # NOTE(review): the "orcid" in this method name looks like a copy-paste
    # leftover; the test exercises correctify_scopus. Name kept so existing
    # test IDs stay stable.
    def test_orcid_with_spaces(self):
        """Surrounding spaces are removed from the Scopus ID"""
        padded = " 01234567890 "
        result = correctify_scopus(padded)
        self.assertEqual(result, "01234567890")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters