From f76c88f837dd16a38064627e3b3dec45de5904bc Mon Sep 17 00:00:00 2001 From: Lorenzo Vagliano Date: Thu, 10 Oct 2024 17:07:04 +0200 Subject: [PATCH] Altered APS parser in order to add ror field. --- dags/aps/parser.py | 18 +++++++- .../units/aps/data/json_response_content.json | 6 +-- tests/units/aps/test_aps_parser.py | 42 +++++++++---------- 3 files changed, 41 insertions(+), 25 deletions(-) diff --git a/dags/aps/parser.py b/dags/aps/parser.py index 6d10b10a..c46d4506 100644 --- a/dags/aps/parser.py +++ b/dags/aps/parser.py @@ -97,11 +97,27 @@ def _form_authors(self, article): if author["type"] == "Person" ] + + def extract_organization_and_ror(self, text): + pattern = r'(.*?)' + + ror_url = None + + def replace_and_capture(match): + nonlocal ror_url + ror_url = match.group(1) + return match.group(2) + + modified_text = re.sub(pattern, replace_and_capture, text) + + return modified_text, ror_url + def _get_affiliations(self, article, affiliationIds): parsed_affiliations = [ { "value": affiliation["name"], - "organization": (",").join(affiliation["name"].split(",")[:-1]), + "organization": self.extract_organization_and_ror(affiliation["name"])[0], + "ror": self.extract_organization_and_ror(affiliation["name"])[1], } for affiliation in article["affiliations"] if affiliation["id"] in affiliationIds diff --git a/tests/units/aps/data/json_response_content.json b/tests/units/aps/data/json_response_content.json index a58ff323..5460fa86 100644 --- a/tests/units/aps/data/json_response_content.json +++ b/tests/units/aps/data/json_response_content.json @@ -38,11 +38,11 @@ ], "affiliations": [ { - "name": "Department of Physics, University of Oregon, Eugene, Oregon 97403, USA", + "name": "Department of Physics and Astronomy, Vanderbilt University, Nashville, Tennessee 37240, USA", "id": "a1" }, { - "name": "Department of Physics, Tsinghua University, Beijing 100084", + "name": "Department of Physics and Astronomy, Vanderbilt University, Nashville, Tennessee 37240, USA", "id": "a2" } ], @@ -179,7 +179,7 @@ ], "affiliations": [ { - "name": "Department of Physics, University of Toronto, Toronto, Ontario, Canada M5S1A7", + "name": "Department of Physics and Astronomy, Vanderbilt University, Nashville, Tennessee 37240, USA", "id": "a1" } ], diff --git a/tests/units/aps/test_aps_parser.py b/tests/units/aps/test_aps_parser.py index 6a620148..987ba470 100644 --- a/tests/units/aps/test_aps_parser.py +++ b/tests/units/aps/test_aps_parser.py @@ -61,9 +61,9 @@ def parsed_articles(parser, articles): "surname": "Wu", "affiliations": [ { - "value": "Department of Physics, University of Oregon, Eugene, Oregon 97403, USA", - "organization": "Department of Physics, University of Oregon, Eugene, Oregon 97403", - # "country": "USA", + "value": "Department of Physics and Astronomy, Vanderbilt University, Nashville, Tennessee 37240, USA", + "organization": "Department of Physics and Astronomy, Vanderbilt University, Nashville, Tennessee 37240, USA", + "ror": "https://ror.org/02vm5rt34" } ], }, @@ -73,9 +73,9 @@ def parsed_articles(parser, articles): "surname": "Turner", "affiliations": [ { - "value": "Department of Physics, University of Oregon, Eugene, Oregon 97403, USA", - "organization": "Department of Physics, University of Oregon, Eugene, Oregon 97403", - # "country": "USA", + "value": "Department of Physics and Astronomy, Vanderbilt University, Nashville, Tennessee 37240, USA", + "organization": "Department of Physics and Astronomy, Vanderbilt University, Nashville, Tennessee 37240, USA", + "ror": "https://ror.org/02vm5rt34" } ], }, @@ -85,9 +85,9 @@ def parsed_articles(parser, articles): "surname": "Wang", "affiliations": [ { - "value": "Department of Physics, University of Oregon, Eugene, Oregon 97403, USA", - "organization": "Department of Physics, University of Oregon, Eugene, Oregon 97403", - # "country": "USA", + "value": "Department of Physics and Astronomy, Vanderbilt University, Nashville, Tennessee 37240, USA", + "organization": "Department of Physics and Astronomy, Vanderbilt University, Nashville, Tennessee 37240, USA", + "ror": "https://ror.org/02vm5rt34" } ], }, @@ -97,9 +97,9 @@ def parsed_articles(parser, articles): "surname": "Borel", "affiliations": [ { - "value": "Department of Physics, Tsinghua University, Beijing 100084", - "organization": "Department of Physics, Tsinghua University", - # "country": "China", + "value": "Department of Physics and Astronomy, Vanderbilt University, Nashville, Tennessee 37240, USA", + "organization": "Department of Physics and Astronomy, Vanderbilt University, Nashville, Tennessee 37240, USA", + "ror": "https://ror.org/02vm5rt34" } ], }, @@ -111,9 +111,9 @@ def parsed_articles(parser, articles): "surname": "Boudjada", "affiliations": [ { - "value": "Department of Physics, University of Toronto, Toronto, Ontario, Canada M5S1A7", - "organization": "Department of Physics, University of Toronto, Toronto, Ontario", - # "country": "Canada", + "value": "Department of Physics and Astronomy, Vanderbilt University, Nashville, Tennessee 37240, USA", + "organization": "Department of Physics and Astronomy, Vanderbilt University, Nashville, Tennessee 37240, USA", + "ror": "https://ror.org/02vm5rt34" } ], }, @@ -123,9 +123,9 @@ def parsed_articles(parser, articles): "surname": "Buessen", "affiliations": [ { - "value": "Department of Physics, University of Toronto, Toronto, Ontario, Canada M5S1A7", - "organization": "Department of Physics, University of Toronto, Toronto, Ontario", - # "country": "Canada", + "value": "Department of Physics and Astronomy, Vanderbilt University, Nashville, Tennessee 37240, USA", + "organization": "Department of Physics and Astronomy, Vanderbilt University, Nashville, Tennessee 37240, USA", + "ror": "https://ror.org/02vm5rt34" } ], }, @@ -135,9 +135,9 @@ def parsed_articles(parser, articles): "surname": "Paramekanti", "affiliations": [ { - "value": "Department of Physics, University of Toronto, Toronto, Ontario, Canada M5S1A7", - "organization": "Department of Physics, University of Toronto, Toronto, Ontario", - # "country": "Canada", + "value": "Department of Physics and Astronomy, Vanderbilt University, Nashville, Tennessee 37240, USA", + "organization": "Department of Physics and Astronomy, Vanderbilt University, Nashville, Tennessee 37240, USA", + "ror": "https://ror.org/02vm5rt34" } ], },