Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

common: added countries mapping #191

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions dags/aps/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from common.parsing.json_extractors import CustomExtractor, NestedValueExtractor
from common.parsing.parser import IParser
from common.utils import construct_license
from common.utils import construct_license, parse_country_from_value
from inspire_utils.record import get_value
from structlog import get_logger

Expand Down Expand Up @@ -101,7 +101,7 @@ def _get_affiliations(self, article, affiliationIds):
{
"value": affiliation["name"],
"organization": (",").join(affiliation["name"].split(",")[:-1]),
"country": affiliation["name"].split(", ")[-1:][0],
"country": parse_country_from_value(affiliation["name"]),
}
for affiliation in article["affiliations"]
if affiliation["id"] in affiliationIds
Expand Down
177 changes: 177 additions & 0 deletions dags/common/countries_mapping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
# Maps a token that may appear in an affiliation string (institute acronym,
# country alias, sub-national region, city, ...) to a normalised country name.
#
# ORDER MATTERS: consumers scan the keys in insertion order and return the
# first one found in the affiliation, so a more specific key must come before
# any shorter key it contains (e.g. "New South Wales" before "Wales",
# "People's Republic of China" before "Republic of China" before "China").
# Every key must be unique: a repeated key in a dict literal silently
# overrides the earlier value.
COUNTRIES_DEFAULT_MAPPING = {
    # Institutes and organisations.
    "INFN": "Italy",
    "Democratic People's Republic of Korea": "North Korea",
    "Korea, Democratic People's Republic of": "North Korea",
    "Korea, Republic of": "South Korea",
    "DPR Korea": "North Korea",
    "DPR. Korea": "North Korea",
    "CERN": "CERN",
    "European Organization for Nuclear Research": "CERN",
    "Conseil Européen pour la Recherche Nucléaire": "CERN",
    "KEK": "Japan",
    "DESY": "Germany",
    "FERMILAB": "USA",
    "FNAL": "USA",
    "SLAC": "USA",
    "Stanford Linear Accelerator Center": "USA",
    "Joint Institute for Nuclear Research": "JINR",
    "JINR": "JINR",
    # Regions whose name contains another country or alias; they must be
    # listed before the shorter key ("Cyprus", "Mexico", "Wales", "Ireland").
    "Northern Cyprus": "Turkey",
    "North Cyprus": "Turkey",
    "New Mexico": "USA",
    "New South Wales": "Australia",
    "United Kingdom of Great Britain and Northern Ireland": "UK",
    "Northern Ireland": "UK",
    "South China Normal University": "China",
    "Hong Kong China": "Hong Kong",
    "Hong-Kong China": "Hong Kong",
    "Hong Kong, China": "Hong Kong",
    "Hong Kong": "Hong Kong",
    "Hong-Kong": "Hong Kong",
    "Algeria": "Algeria",
    "Argentina": "Argentina",
    "Armenia": "Armenia",
    "Australia": "Australia",
    "Austria": "Austria",
    "Azerbaijan": "Azerbaijan",
    "Belarus": "Belarus",
    "Belgium": "Belgium",
    "Belgique": "Belgium",
    "Bangladesh": "Bangladesh",
    "Brazil": "Brazil",
    "Brasil": "Brazil",
    "Benin": "Benin",
    "Bulgaria": "Bulgaria",
    "Bosnia and Herzegovina": "Bosnia and Herzegovina",
    "Canada": "Canada",
    "Chile": "Chile",
    "ROC": "Taiwan",
    "R.O.C": "Taiwan",
    # All PRC spellings come before "Republic of China" (Taiwan), which in
    # turn comes before the bare "China" fallback.
    "China (PRC)": "China",
    "PR China": "China",
    "P.R.China": "China",
    "People's Republic of China": "China",
    "People’s Republic of China": "China",
    "Peoples Republic of China": "China",
    "Republic of China": "Taiwan",
    "China": "China",
    "Colombia": "Colombia",
    "Costa Rica": "Costa Rica",
    "Cuba": "Cuba",
    "Croatia": "Croatia",
    "Cyprus": "Cyprus",
    "Czech Republic": "Czech Republic",
    "Czech": "Czech Republic",
    "Czechia": "Czech Republic",
    "Denmark": "Denmark",
    "Egypt": "Egypt",
    "Estonia": "Estonia",
    "Ecuador": "Ecuador",
    "Finland": "Finland",
    "France": "France",
    "Germany": "Germany",
    "Deutschland": "Germany",
    "Greece": "Greece",
    "Hungary": "Hungary",
    "Iceland": "Iceland",
    "India": "India",
    "Indonesia": "Indonesia",
    "Iran": "Iran",
    "Ireland": "Ireland",
    "Israel": "Israel",
    "Italy": "Italy",
    "Italia": "Italy",
    "Japan": "Japan",
    "Jamaica": "Jamaica",
    "Korea": "South Korea",
    "Republic of Korea": "South Korea",
    "South Korea": "South Korea",
    "Latvia": "Latvia",
    "Lebanon": "Lebanon",
    "Lithuania": "Lithuania",
    "Luxembourg": "Luxembourg",
    "Macedonia": "Macedonia",
    "Mexico": "Mexico",
    "Monaco": "Monaco",
    "Montenegro": "Montenegro",
    "Morocco": "Morocco",
    "Niger": "Niger",
    "Nigeria": "Nigeria",
    "Netherlands": "Netherlands",
    "The Netherlands": "Netherlands",
    "New Zealand": "New Zealand",
    "Zealand": "New Zealand",
    "Norway": "Norway",
    "Oman": "Oman",
    "Sultanate of Oman": "Oman",
    "Pakistan": "Pakistan",
    "Panama": "Panama",
    "Philippines": "Philippines",
    # Common misspelling, kept as an alias of the correct name.
    "Philipines": "Philippines",
    "Poland": "Poland",
    "Portugalo": "Portugal",
    "Portugal": "Portugal",
    "Republic of Belarus": "Belarus",
    "Republic of Benin": "Benin",
    "Republic of San Marino": "San Marino",
    "Republic of South Africa": "South Africa",
    "Romania": "Romania",
    "Russia": "Russia",
    "Russian Federation": "Russia",
    "Saudi Arabia": "Saudi Arabia",
    "Kingdom of Saudi Arabia": "Saudi Arabia",
    "Arabia": "Saudi Arabia",
    "Serbia": "Serbia",
    "Singapore": "Singapore",
    "Slovak Republic": "Slovakia",
    "Slovak": "Slovakia",
    "Slovakia": "Slovakia",
    "Slovenia": "Slovenia",
    "South Africa": "South Africa",
    "Africa": "South Africa",
    "España": "Spain",
    "Spain": "Spain",
    "Sudan": "Sudan",
    "Sweden": "Sweden",
    "Switzerland": "Switzerland",
    "Syria": "Syria",
    "Taiwan": "Taiwan",
    "Thailand": "Thailand",
    "Tunisia": "Tunisia",
    "Turkey": "Turkey",
    "Ukraine": "Ukraine",
    "United Kingdom": "UK",
    "Kingdom": "UK",
    "UK": "UK",
    "England": "UK",
    "Scotland": "UK",
    "Wales": "UK",
    "U.K": "UK",
    "United States of America": "USA",
    "United States": "USA",
    "USA": "USA",
    "U.S.A": "USA",
    "America": "USA",
    "Uruguay": "Uruguay",
    "Uzbekistan": "Uzbekistan",
    "Venezuela": "Venezuela",
    "Vietnam": "Vietnam",
    "Viet Nam": "Vietnam",
    "Yemen": "Yemen",
    "Peru": "Peru",
    "Kuwait": "Kuwait",
    "Sri Lanka": "Sri Lanka",
    "Lanka": "Sri Lanka",
    "Kazakhstan": "Kazakhstan",
    "Mongolia": "Mongolia",
    "United Arab Emirates": "United Arab Emirates",
    "Emirates": "United Arab Emirates",
    "Malaysia": "Malaysia",
    "Qatar": "Qatar",
    "Kyrgyz Republic": "Kyrgyz Republic",
    "Jordan": "Jordan",
    # City fallbacks.
    "Belgrade": "Serbia",
    "Istanbul": "Turkey",
    "Ankara": "Turkey",
    "Rome": "Italy",
    # Listed last so that "Georgia, USA" resolves through the USA keys first.
    "Georgia": "Georgia",
}
7 changes: 7 additions & 0 deletions dags/common/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,10 @@ def __init__(self, license):
class EmptyOutputFromPreviousTask(Exception):
    """Error reporting that the previous task produced an empty output."""

    def __init__(self, taks_name):
        # The parameter spelling is part of the public signature and is kept
        # as-is so keyword callers keep working.
        message = f"The output from previous task is empty: {taks_name}"
        super().__init__(message)


class FoundMoreThanOneMatchOrNone(Exception):
    """Error reporting that a country lookup gave zero or several matches."""

    def __init__(self, country_value):
        message = f"Found more than one or zero match for a country: {country_value}"
        super().__init__(message)
2 changes: 0 additions & 2 deletions dags/common/parsing/generic_parsing.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import re
from datetime import date


def take_first(arr):
try:
return next(filter(None, arr))
Expand Down
26 changes: 25 additions & 1 deletion dags/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,23 @@
from stat import S_ISDIR, S_ISREG

import backoff
import pycountry
import requests
from airflow.models.dagrun import DagRun
from airflow.utils.state import DagRunState
from common.constants import (
BY_PATTERN,
CDATA_PATTERN,
COUNTRY_PARSING_PATTERN,
CREATIVE_COMMONS_PATTERN,
LICENSE_PATTERN,
)
from common.exceptions import UnknownFileExtension, UnknownLicense
from common.countries_mapping import COUNTRIES_DEFAULT_MAPPING
from common.exceptions import (
FoundMoreThanOneMatchOrNone,
UnknownFileExtension,
UnknownLicense,
)
from structlog import get_logger

logger = get_logger()
Expand Down Expand Up @@ -255,3 +262,20 @@ def create_or_update_article(data):
)
response.raise_for_status()
return response.json()


def parse_country_from_value(affiliation_value):
    """Resolve the country mentioned in an affiliation string.

    The token captured by ``COUNTRY_PARSING_PATTERN`` is looked up with
    pycountry's fuzzy search and accepted only when that search yields
    exactly one candidate. In every other case the curated
    ``COUNTRIES_DEFAULT_MAPPING`` is consulted through
    ``find_country_match_from_mapping``.

    Args:
        affiliation_value: Affiliation (or bare country) text to analyse.

    Returns:
        The pycountry name of the single matching country, otherwise the
        result of ``find_country_match_from_mapping`` (``None`` when the
        mapping has no matching key either).
    """
    match = COUNTRY_PARSING_PATTERN.search(affiliation_value)
    if match is None:
        # No country-like token to hand to pycountry; calling ``.group`` on
        # the missing match would raise AttributeError, so go straight to
        # the curated mapping.
        return find_country_match_from_mapping(affiliation_value)

    try:
        mapped_countries = pycountry.countries.search_fuzzy(match.group(0))
        if len(mapped_countries) != 1:
            raise FoundMoreThanOneMatchOrNone(affiliation_value)
        return mapped_countries[0].name
    except (LookupError, FoundMoreThanOneMatchOrNone):
        # pycountry alone is not enough: its fuzzy search raises LookupError
        # when nothing matches and can return many candidates for a short
        # token (searching "USA" also returns Indonesia, Azerbaijan, the
        # Philippines, Turkey, ...), so ambiguous or empty results are
        # resolved through the curated mapping instead.
        return find_country_match_from_mapping(affiliation_value)


def find_country_match_from_mapping(affiliation_value, mapping=None):
    """Return the country of the first mapping key found in the affiliation.

    Keys are tried in the mapping's insertion order and matched
    case-insensitively as whole tokens: a key only matches when it is not
    directly preceded or followed by a word character, so "Niger" does not
    match inside "Nigeria".

    Args:
        affiliation_value: Affiliation text to search.
        mapping: Optional ``{key: country}`` dict to search; defaults to
            ``COUNTRIES_DEFAULT_MAPPING``.

    Returns:
        The mapped country name of the first matching key, or ``None`` when
        the text is empty or no key matches.
    """
    if not affiliation_value:
        return None
    if mapping is None:
        mapping = COUNTRIES_DEFAULT_MAPPING

    for key, country in mapping.items():
        # Keys are literal text, not patterns: escape them so "U.K" or
        # "China (PRC)" are not interpreted as regular expressions.
        # Lookarounds are used instead of ``\b`` because ``\b`` cannot anchor
        # a key that ends in a non-word character such as ")".
        pattern = rf"(?<!\w){re.escape(key)}(?!\w)"
        if re.search(pattern, affiliation_value, flags=re.IGNORECASE):
            return country
    return None
4 changes: 3 additions & 1 deletion dags/elsevier/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
CustomExtractor,
TextExtractor,
)
from common.utils import extract_text
from common.utils import extract_text, parse_country_from_value
from structlog import get_logger


Expand Down Expand Up @@ -185,6 +185,7 @@ def _get_affiliation(self, article, ref_id="", affiliations=[]):
field_name="country",
dois=self.dois,
)
country = country and parse_country_from_value(country)
if affiliation_value and organization and country:
affiliations.append(
{
Expand All @@ -204,6 +205,7 @@ def _get_affiliation(self, article, ref_id="", affiliations=[]):
affiliations.append(
{
"value": affiliation_value,
"country": parse_country_from_value(affiliation_value),
}
)

Expand Down
5 changes: 3 additions & 2 deletions dags/hindawi/parser.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import re
import xml.etree.ElementTree as ET

from common.constants import COUNTRY_PARSING_PATTERN, ORGANIZATION_PARSING_PATTERN
from common.constants import ORGANIZATION_PARSING_PATTERN
from common.parsing.parser import IParser
from common.parsing.xml_extractors import ConstantExtractor, CustomExtractor
from common.utils import parse_country_from_value
from hindawi.xml_extractors import HindawiTextExtractor as TextExtractor
from structlog import get_logger

Expand Down Expand Up @@ -121,7 +122,7 @@ def _get_affiliations(self, author):
{
"value": affiliation.text,
"organization": ORGANIZATION_PARSING_PATTERN.sub("", affiliation.text),
"country": COUNTRY_PARSING_PATTERN.search(affiliation.text).group(0),
"country": parse_country_from_value(affiliation.text),
}
for affiliation in affiliations
]
Expand Down
6 changes: 5 additions & 1 deletion dags/iop/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
get_license_type,
get_license_type_and_version_from_url,
parse_to_int,
parse_country_from_value
)
from idutils import is_arxiv
from inspire_utils.date import PartialDate
Expand Down Expand Up @@ -303,12 +304,15 @@ def _get_institution(self, article, id):
)

def _get_country(self, article, id):
return extract_text(
country = extract_text(
article=article,
path=f"front/article-meta/contrib-group/aff[@id='{id}']/country",
field_name="country",
dois=self.dois,
)
if not country:
return
return parse_country_from_value(country)

def _extract_copyright_year(self, article):
return extract_text(
Expand Down
8 changes: 4 additions & 4 deletions dags/springer/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
CustomExtractor,
TextExtractor,
)
from common.utils import construct_license
from common.utils import construct_license, parse_country_from_value
from structlog import get_logger


Expand Down Expand Up @@ -171,12 +171,12 @@ def _clean_aff(self, article: ET.Element):
city_node,
state_node,
postcode_node,
country_node,
]
if node is not None
]

return ", ".join(result), org_name_node.text, country_node.text
country = parse_country_from_value(country_node.text)
result.append(country)
return ", ".join(result), org_name_node.text, country

def _get_published_date(self, article: ET.Element):
year = article.find(
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ busypie==0.4.5
pydantic==1.10.7
jsonschema==4.17.3
plyvel==1.5.0
pycountry==22.3.5
2 changes: 1 addition & 1 deletion tests/integration/elsevier/test_elsevier_dag_pull_sftp.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
from common.repository import IRepository
from elsevier.repository import ElsevierRepository
from elsevier.sftp_service import ElsevierSFTPService
from structlog import get_logger
from pytest import fixture
from structlog import get_logger

DAG_NAME = "elsevier_pull_sftp"

Expand Down
4 changes: 2 additions & 2 deletions tests/integration/springer/test_springer_dag_process_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,9 +236,9 @@ def test_dag_validate_file_pass(article):
"email": "[email protected]",
"affiliations": [
{
"value": "School of Physics, Korea Institute for Advanced Study, Dongdaemun-gu, Seoul, 02455, Korea",
"value": "School of Physics, Korea Institute for Advanced Study, Dongdaemun-gu, Seoul, 02455, South Korea",
"organization": "School of Physics, Korea Institute for Advanced Study",
"country": "Korea",
"country": "South Korea",
}
],
"full_name": "Nosaka, Tomoki",
Expand Down
Loading