Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

185 remove rlm unicode #186

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions pipeline/process/base/reconciler.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from pipeline.process.utils.mapper_utils import get_year_from_timespan
from sqlitedict import SqliteDict
from pipeline.storage.idmap.lmdb import TabLmdb
import re


# Abstract class definition, useless without actual data
Expand Down Expand Up @@ -54,6 +55,9 @@ def extract_uris(self, rec):
equivs = rec.get("equivalent", [])
return [x["id"] for x in equivs if "id" in x]

def clean_names(self, name):
    """Normalize a name string so equivalent names compare equal.

    Invisible Unicode formatting characters are removed, then the result
    is lower-cased and surrounding whitespace is trimmed.

    Characters removed:
      * U+061C            Arabic Letter Mark
      * U+200B - U+200F   zero-width space/joiners, LRM, RLM
      * U+202A - U+202E   directional embeddings and overrides
      * U+2066 - U+2069   directional isolates (LRI, RLI, FSI, PDI)
      * U+FEFF            zero-width no-break space / byte order mark

    :param name: the raw name string
    :return: the cleaned, lower-cased, stripped string
    """
    # Strip the format characters *before* .strip() so that whitespace
    # sitting just inside a leading/trailing mark is trimmed as well.
    invisible = r'[\u061c\u200b-\u200f\u202a-\u202e\u2066-\u2069\ufeff]'
    return re.sub(invisible, '', name).lower().strip()

def extract_names(self, rec):
ns = self.configs.external["aat"]["namespace"]
gbls = self.configs.globals_cfg
Expand Down Expand Up @@ -90,7 +94,7 @@ def extract_names(self, rec):
print(f" None in Name classifications: {rec['id']}")

if aat_primaryName in cxnids and "content" in nm:
val = nm["content"].lower().strip()
val = self.clean_names(nm['content'])
for lang_id, num in check_langs.items():
if lang_id in langids:
vals[val] = num
Expand All @@ -108,11 +112,11 @@ def extract_names(self, rec):
for part in parts:
cxns = part.get("classified_as", [])
if aat_firstName in [cx["id"] for cx in cxns]:
first = part["content"].lower().strip()
first = self.clean_names(part['content'])
elif aat_middleName in [cx["id"] for cx in cxns]:
middle = part["content"].lower().strip()
middle = self.clean_names(part['content'])
elif aat_lastName in [cx["id"] for cx in cxns]:
last = part["content"].lower().strip()
last = self.clean_names(part['content'])

if last and first and middle:
vals[f"{last}, {first} {middle}"] = 1
Expand Down