diff --git a/schema_transformations.py b/schema_transformations.py
index a630c89c3..4a730da67 100644
--- a/schema_transformations.py
+++ b/schema_transformations.py
@@ -111,7 +111,7 @@ def _default2_default3(ctx, m):
     """
     Add affiliation identifiers to creators and contributors.
 
-    Tags are renamed to Keywords and Related Datapackage to Related Resource.
+    Tags are renamed to Keywords and Related Datapackage to Related Resource. Ill-formatted ORCID and ResearcherID person identifiers are corrected.
 
     :param ctx: Combined type of a callback and rei struct
     :param m: Metadata to transform (default-2)
@@ -121,19 +121,67 @@ def _default2_default3(ctx, m):
     if m.get('Creator', False):
         # For this creator step through all its affiliations
         for creator in m['Creator']:
-            new_affiliations = []
+            affiliations = []
             for affiliation in creator['Affiliation']:
-                new_affiliations.append({"Affiliation_Name": affiliation, "Affiliation_Identifier": ""})
-            creator['Affiliation'] = new_affiliations
+                affiliations.append({"Affiliation_Name": affiliation, "Affiliation_Identifier": ""})
+            creator['Affiliation'] = affiliations
+
+            personal_identifiers = []
+            for personal_identifier in creator.get('Person_Identifier', []):
+                if 'Name_Identifier_Scheme' not in personal_identifier:
+                    continue  # Without a scheme the identifier cannot be validated; leave it out.
+                elif personal_identifier.get('Name_Identifier_Scheme', None) == 'ORCID':
+                    # Check for incorrect ORCID format.
+                    if not re.search("^(https://orcid.org/)[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]$", personal_identifier.get('Name_Identifier', None)):
+                        corrected_orcid = correctify_orcid(personal_identifier['Name_Identifier'])
+                        # Only if an actual correction took place, change the value.
+                        if corrected_orcid != personal_identifier['Name_Identifier']:
+                            personal_identifier['Name_Identifier'] = corrected_orcid
+                elif personal_identifier.get('Name_Identifier_Scheme', None) == 'ResearcherID (Web of Science)':
+                    # Check for incorrect ResearcherID format.
+                    if not re.search("^(https://www.researcherid.com/rid/)[A-Z]-[0-9]{4}-[0-9]{4}$", personal_identifier.get('Name_Identifier', None)):
+                        corrected_researcher_id = correctify_researcher_id(personal_identifier['Name_Identifier'])
+                        # Only if an actual correction took place, change the value.
+                        if corrected_researcher_id != personal_identifier['Name_Identifier']:
+                            personal_identifier['Name_Identifier'] = corrected_researcher_id
+
+                personal_identifiers.append({"Name_Identifier_Scheme": personal_identifier['Name_Identifier_Scheme'], "Name_Identifier": personal_identifier['Name_Identifier']})
+
+            if personal_identifiers:
+                creator['Person_Identifier'] = personal_identifiers
 
     if m.get('Contributor', False):
         # For this contributor step through all its affiliations
-        for contrib in m['Contributor']:
-            new_affiliations = []
-            if contrib.get('Affiliation', False):
-                for affiliation in contrib['Affiliation']:
-                    new_affiliations.append({"Affiliation_Name": affiliation, "Affiliation_Identifier": ""})
-                contrib['Affiliation'] = new_affiliations
+        for contributor in m['Contributor']:
+            affiliations = []
+            if contributor.get('Affiliation', False):
+                for affiliation in contributor['Affiliation']:
+                    affiliations.append({"Affiliation_Name": affiliation, "Affiliation_Identifier": ""})
+                contributor['Affiliation'] = affiliations
+
+            personal_identifiers = []
+            for personal_identifier in contributor.get('Person_Identifier', []):
+                if 'Name_Identifier_Scheme' not in personal_identifier:
+                    continue  # Without a scheme the identifier cannot be validated; leave it out.
+                elif personal_identifier.get('Name_Identifier_Scheme', None) == 'ORCID':
+                    # Check for incorrect ORCID format.
+                    if not re.search("^(https://orcid.org/)[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]$", personal_identifier.get('Name_Identifier', None)):
+                        corrected_orcid = correctify_orcid(personal_identifier['Name_Identifier'])
+                        # Only if an actual correction took place, change the value.
+                        if corrected_orcid != personal_identifier['Name_Identifier']:
+                            personal_identifier['Name_Identifier'] = corrected_orcid
+                elif personal_identifier.get('Name_Identifier_Scheme', None) == 'ResearcherID (Web of Science)':
+                    # Check for incorrect ResearcherID format.
+                    if not re.search("^(https://www.researcherid.com/rid/)[A-Z]-[0-9]{4}-[0-9]{4}$", personal_identifier.get('Name_Identifier', None)):
+                        corrected_researcher_id = correctify_researcher_id(personal_identifier['Name_Identifier'])
+                        # Only if an actual correction took place, change the value.
+                        if corrected_researcher_id != personal_identifier['Name_Identifier']:
+                            personal_identifier['Name_Identifier'] = corrected_researcher_id
+
+                personal_identifiers.append({"Name_Identifier_Scheme": personal_identifier['Name_Identifier_Scheme'], "Name_Identifier": personal_identifier['Name_Identifier']})
+
+            if personal_identifiers:
+                contributor['Person_Identifier'] = personal_identifiers
 
     # Rename Tags to Keywords
     if m.get('Tag', False):
@@ -643,3 +691,46 @@ def get(src_id, dst_id):
 
     x = transformations.get(src_id)
     return None if x is None else x.get(dst_id)
+
+
+def correctify_orcid(org_orcid):
+    """Correct an ill-formatted ORCID.
+
+    :param org_orcid: Original ORCID value as found in the metadata
+
+    :returns: ORCID as "https://orcid.org/<id>", or the original value if it cannot be corrected
+    """
+    # Get rid of all spaces.
+    orcid = org_orcid.replace(' ', '')
+
+    # Upper-case X (applied to the space-stripped value, not to the original).
+    orcid = orcid.replace('x', 'X')
+
+    # The last part should hold a valid id like eg: 1234-1234-1234-123X.
+    # If not, it is impossible to correct it to the valid orcid format
+    parts = orcid.split('/')
+    if not re.search("^[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]$", parts[-1]):
+        # Return original value.
+        return org_orcid
+
+    return "https://orcid.org/{}".format(parts[-1])
+
+
+def correctify_researcher_id(org_researcher_id):
+    """Correct an ill-formatted ResearcherID.
+
+    :param org_researcher_id: Original ResearcherID value as found in the metadata
+
+    :returns: ResearcherID as "https://www.researcherid.com/rid/<id>", or the original value if it cannot be corrected
+    """
+    # Get rid of all spaces.
+    researcher_id = org_researcher_id.replace(' ', '')
+
+    # The last part should hold a valid id like eg: A-1234-1234
+    # If not, it is impossible to correct it to the valid ResearcherID format
+    parts = researcher_id.split('/')
+    if not re.search("^[A-Z]-[0-9]{4}-[0-9]{4}$", parts[-1]):
+        # Return original value.
+        return org_researcher_id
+
+    return "https://www.researcherid.com/rid/{}".format(parts[-1])