Skip to content

Commit

Permalink
Altered APS parser in order to add ror field.
Browse files Browse the repository at this point in the history
  • Loading branch information
Lorenzovagliano committed Oct 22, 2024
1 parent 5a98322 commit f76c88f
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 25 deletions.
18 changes: 17 additions & 1 deletion dags/aps/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,11 +97,27 @@ def _form_authors(self, article):
if author["type"] == "Person"
]


def extract_organization_and_ror(self, text):
    """Strip anchor markup from an affiliation string and extract its ROR URL.

    Affiliation names may wrap the organization in an HTML link, e.g.
    ``<a href="https://ror.org/02vm5rt34">Vanderbilt University</a>``.
    Every anchor is replaced by its inner text. The ``href`` is reported
    only when it points at ror.org, so an unrelated link is never
    mistaken for a ROR identifier.

    Args:
        text: Raw affiliation string, possibly containing ``<a>`` tags.

    Returns:
        A ``(plain_text, ror_url)`` tuple. ``ror_url`` is ``None`` when the
        text contains no ROR link; if several ROR links are present, the
        last one is returned.
    """
    # Tolerate extra attributes around href (class, target, ...), any tag
    # case, and link text spanning line breaks.
    anchor_pattern = re.compile(
        r'<a\s[^>]*?href="([^"]+)"[^>]*>(.*?)</a>',
        re.IGNORECASE | re.DOTALL,
    )
    ror_prefix = re.compile(r"https?://(?:www\.)?ror\.org/", re.IGNORECASE)

    ror_url = None

    def replace_and_capture(match):
        nonlocal ror_url
        href = match.group(1)
        if ror_prefix.match(href):
            ror_url = href
        # Always unwrap the anchor so no HTML leaks into the plain text.
        return match.group(2)

    modified_text = anchor_pattern.sub(replace_and_capture, text)

    return modified_text, ror_url

def _get_affiliations(self, article, affiliationIds):
parsed_affiliations = [
{
"value": affiliation["name"],
"organization": (",").join(affiliation["name"].split(",")[:-1]),
"organization": self.extract_organization_and_ror(affiliation["name"])[0],
"ror": self.extract_organization_and_ror(affiliation["name"])[1],
}
for affiliation in article["affiliations"]
if affiliation["id"] in affiliationIds
Expand Down
6 changes: 3 additions & 3 deletions tests/units/aps/data/json_response_content.json
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,11 @@
],
"affiliations": [
{
"name": "Department of Physics, University of Oregon, Eugene, Oregon 97403, USA",
"name": "Department of Physics and Astronomy, <a href=\"https://ror.org/02vm5rt34\">Vanderbilt University</a>, Nashville, Tennessee 37240, USA",
"id": "a1"
},
{
"name": "Department of Physics, Tsinghua University, Beijing 100084",
"name": "Department of Physics and Astronomy, <a href=\"https://ror.org/02vm5rt34\">Vanderbilt University</a>, Nashville, Tennessee 37240, USA",
"id": "a2"
}
],
Expand Down Expand Up @@ -179,7 +179,7 @@
],
"affiliations": [
{
"name": "Department of Physics, University of Toronto, Toronto, Ontario, Canada M5S1A7",
"name": "Department of Physics and Astronomy, <a href=\"https://ror.org/02vm5rt34\">Vanderbilt University</a>, Nashville, Tennessee 37240, USA",
"id": "a1"
}
],
Expand Down
42 changes: 21 additions & 21 deletions tests/units/aps/test_aps_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,9 @@ def parsed_articles(parser, articles):
"surname": "Wu",
"affiliations": [
{
"value": "Department of Physics, University of Oregon, Eugene, Oregon 97403, USA",
"organization": "Department of Physics, University of Oregon, Eugene, Oregon 97403",
# "country": "USA",
"value": "Department of Physics and Astronomy, <a href=\"https://ror.org/02vm5rt34\">Vanderbilt University</a>, Nashville, Tennessee 37240, USA",
"organization": "Department of Physics and Astronomy, Vanderbilt University, Nashville, Tennessee 37240, USA",
"ror": "https://ror.org/02vm5rt34"
}
],
},
Expand All @@ -73,9 +73,9 @@ def parsed_articles(parser, articles):
"surname": "Turner",
"affiliations": [
{
"value": "Department of Physics, University of Oregon, Eugene, Oregon 97403, USA",
"organization": "Department of Physics, University of Oregon, Eugene, Oregon 97403",
# "country": "USA",
"value": "Department of Physics and Astronomy, <a href=\"https://ror.org/02vm5rt34\">Vanderbilt University</a>, Nashville, Tennessee 37240, USA",
"organization": "Department of Physics and Astronomy, Vanderbilt University, Nashville, Tennessee 37240, USA",
"ror": "https://ror.org/02vm5rt34"
}
],
},
Expand All @@ -85,9 +85,9 @@ def parsed_articles(parser, articles):
"surname": "Wang",
"affiliations": [
{
"value": "Department of Physics, University of Oregon, Eugene, Oregon 97403, USA",
"organization": "Department of Physics, University of Oregon, Eugene, Oregon 97403",
# "country": "USA",
"value": "Department of Physics and Astronomy, <a href=\"https://ror.org/02vm5rt34\">Vanderbilt University</a>, Nashville, Tennessee 37240, USA",
"organization": "Department of Physics and Astronomy, Vanderbilt University, Nashville, Tennessee 37240, USA",
"ror": "https://ror.org/02vm5rt34"
}
],
},
Expand All @@ -97,9 +97,9 @@ def parsed_articles(parser, articles):
"surname": "Borel",
"affiliations": [
{
"value": "Department of Physics, Tsinghua University, Beijing 100084",
"organization": "Department of Physics, Tsinghua University",
# "country": "China",
"value": "Department of Physics and Astronomy, <a href=\"https://ror.org/02vm5rt34\">Vanderbilt University</a>, Nashville, Tennessee 37240, USA",
"organization": "Department of Physics and Astronomy, Vanderbilt University, Nashville, Tennessee 37240, USA",
"ror": "https://ror.org/02vm5rt34"
}
],
},
Expand All @@ -111,9 +111,9 @@ def parsed_articles(parser, articles):
"surname": "Boudjada",
"affiliations": [
{
"value": "Department of Physics, University of Toronto, Toronto, Ontario, Canada M5S1A7",
"organization": "Department of Physics, University of Toronto, Toronto, Ontario",
# "country": "Canada",
"value": "Department of Physics and Astronomy, <a href=\"https://ror.org/02vm5rt34\">Vanderbilt University</a>, Nashville, Tennessee 37240, USA",
"organization": "Department of Physics and Astronomy, Vanderbilt University, Nashville, Tennessee 37240, USA",
"ror": "https://ror.org/02vm5rt34"
}
],
},
Expand All @@ -123,9 +123,9 @@ def parsed_articles(parser, articles):
"surname": "Buessen",
"affiliations": [
{
"value": "Department of Physics, University of Toronto, Toronto, Ontario, Canada M5S1A7",
"organization": "Department of Physics, University of Toronto, Toronto, Ontario",
# "country": "Canada",
"value": "Department of Physics and Astronomy, <a href=\"https://ror.org/02vm5rt34\">Vanderbilt University</a>, Nashville, Tennessee 37240, USA",
"organization": "Department of Physics and Astronomy, Vanderbilt University, Nashville, Tennessee 37240, USA",
"ror": "https://ror.org/02vm5rt34"
}
],
},
Expand All @@ -135,9 +135,9 @@ def parsed_articles(parser, articles):
"surname": "Paramekanti",
"affiliations": [
{
"value": "Department of Physics, University of Toronto, Toronto, Ontario, Canada M5S1A7",
"organization": "Department of Physics, University of Toronto, Toronto, Ontario",
# "country": "Canada",
"value": "Department of Physics and Astronomy, <a href=\"https://ror.org/02vm5rt34\">Vanderbilt University</a>, Nashville, Tennessee 37240, USA",
"organization": "Department of Physics and Astronomy, Vanderbilt University, Nashville, Tennessee 37240, USA",
"ror": "https://ror.org/02vm5rt34"
}
],
},
Expand Down

0 comments on commit f76c88f

Please sign in to comment.