Skip to content

Commit

Permalink
dags: enhance - fix country issue
Browse files Browse the repository at this point in the history
Signed-off-by: pamfilos <[email protected]>
  • Loading branch information
pamfilos committed Apr 24, 2024
1 parent 1282c86 commit b56c1e1
Show file tree
Hide file tree
Showing 6 changed files with 580 additions and 6 deletions.
17 changes: 17 additions & 0 deletions dags/common/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,3 +232,20 @@
("Georgia", "Georgia"),
]
)

# Institution name/acronym -> value reported as the affiliation "country".
# parse_country_from_value() scans this table in order and returns the value
# of the first key found as a whole word (case-insensitive) in the affiliation
# text, so order matters when keys can overlap.
# "CERN" and "JINR" are deliberately mapped to themselves instead of a country.
INSTITUTIONS_AND_COUNTRIES_MAPPING = OrderedDict(
    [
        ("INFN", "Italy"),
        ("European Organization for Nuclear Research", "CERN"),
        ("Conseil Européen pour la Recherche Nucléaire", "CERN"),
        ("CERN", "CERN"),
        ("KEK", "Japan"),
        ("DESY", "Germany"),
        ("FERMILAB", "USA"),
        ("FNAL", "USA"),
        # "SLAC" is the acronym of the Stanford Linear Accelerator Center.
        ("SLAC", "USA"),
        # Misspelled key kept so inputs that matched it before still match.
        ("SLACK", "USA"),
        ("Stanford Linear Accelerator Center", "USA"),
        ("Joint Institute for Nuclear Research", "JINR"),
        ("JINR", "JINR"),
        ("ROC", "Taiwan"),
        ("R.O.C", "Taiwan"),
    ]
)
2 changes: 1 addition & 1 deletion dags/common/enhancer.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def __construct_authors(self, item):

if not affiliation.get("country"):
_parsed_country = parse_country_from_value(affiliation.get("value"))
if _parsed_country:
if _parsed_country is not None:
affiliation["country"] = _parsed_country

if affiliation.get("country"):
Expand Down
16 changes: 13 additions & 3 deletions dags/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,10 @@
CREATIVE_COMMONS_PATTERN,
LICENSE_PATTERN,
)
from common.constants import COUNTRIES_DEFAULT_MAPPING
from common.constants import (
COUNTRIES_DEFAULT_MAPPING,
INSTITUTIONS_AND_COUNTRIES_MAPPING,
)
from common.exceptions import (
FoundMoreThanOneMatchOrNone,
UnknownFileExtension,
Expand Down Expand Up @@ -274,13 +277,20 @@ def create_or_update_article(data):


def parse_country_from_value(affiliation_value):
    """Resolve the country (or institution label) for an affiliation string.

    Lookup order, first hit wins:

    1. Keys of ``INSTITUTIONS_AND_COUNTRIES_MAPPING``, matched as whole words
       (case-insensitive) anywhere in ``affiliation_value``.
    2. Keys of ``COUNTRIES_DEFAULT_MAPPING``, matched the same way but only
       against the text after the last comma.
    3. ``pycountry.countries.search_fuzzy`` on that last segment, accepted
       only when it yields exactly one candidate.
    4. ``find_country_match_from_mapping(affiliation_value)`` as fallback.

    Mapping keys are treated as literal text: they are escaped before being
    placed in the pattern, so a key such as "R.O.C" only matches those exact
    characters rather than "R<any>O<any>C".

    :param affiliation_value: free-text affiliation string.
    :return: the mapped value, the pycountry country name, or whatever the
        fallback helper returns.
    """
    for key, val in INSTITUTIONS_AND_COUNTRIES_MAPPING.items():
        pattern = r"\b%s\b" % re.escape(key)
        if re.search(pattern, affiliation_value, flags=re.IGNORECASE):
            return val

    # Only the last comma-separated segment is used for the country lookups.
    country = affiliation_value.split(",")[-1].strip()
    for key, val in COUNTRIES_DEFAULT_MAPPING.items():
        pattern = r"\b%s\b" % re.escape(key)
        if re.search(pattern, country, flags=re.IGNORECASE):
            return val

    try:
        mapped_countries = pycountry.countries.search_fuzzy(country)
        # Zero or several candidates are both treated as "no reliable answer".
        if len(mapped_countries) != 1:
            raise FoundMoreThanOneMatchOrNone(affiliation_value)
        return mapped_countries[0].name
    except (LookupError, FoundMoreThanOneMatchOrNone):
        # LookupError is caught alongside the explicit ambiguity signal so a
        # failed fuzzy search also ends in the mapping-based fallback.
        return find_country_match_from_mapping(affiliation_value)


Expand Down
2 changes: 1 addition & 1 deletion dags/elsevier/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
CustomExtractor,
TextExtractor,
)
from common.utils import extract_text, parse_country_from_value
from common.utils import extract_text
from structlog import get_logger


Expand Down
2 changes: 1 addition & 1 deletion dags/springer/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
CustomExtractor,
TextExtractor,
)
from common.utils import construct_license, parse_country_from_value
from common.utils import construct_license
from structlog import get_logger


Expand Down
Loading

0 comments on commit b56c1e1

Please sign in to comment.