From 9980e0568a83162d044b99d8a53483c4348eaef0 Mon Sep 17 00:00:00 2001 From: pamfilos Date: Thu, 5 Dec 2024 15:35:28 +0100 Subject: [PATCH] common(formatting): fix linting, formatting errors Signed-off-by: pamfilos --- dags/common/cleanup.py | 13 ++++++------ dags/springer/springer_process_file.py | 20 ++++++++++--------- tests/units/aps/test_aps_parser.py | 3 ++- tests/units/springer/test_parser.py | 18 +++++++++-------- .../springer/test_parser/weird.title.Meta | 2 +- 5 files changed, 30 insertions(+), 26 deletions(-) diff --git a/dags/common/cleanup.py b/dags/common/cleanup.py index 229351de..595141ed 100644 --- a/dags/common/cleanup.py +++ b/dags/common/cleanup.py @@ -15,28 +15,27 @@ def convert_html_subscripts_to_latex(input): input = re.sub("(.*?)", r"$^{\1}$", input) return input + def clean_inline_expressions(input): input = re.sub( - r"(.*?)", - r"\1", - input, - flags=re.DOTALL + r"(.*?)", r"\1", input, flags=re.DOTALL ) input = re.sub( r"", r"\1", - input + input, ) input = re.sub( r".*?", "", input, - flags=re.DOTALL + flags=re.DOTALL, ) - input = input.replace('\n', '').replace('\r', '') + input = input.replace("\n", "").replace("\r", "") return input + def convert_html_italics_to_latex(input): input = re.sub(r"]*>(.*?)", r"$\\textit{\1}$", input) return input diff --git a/dags/springer/springer_process_file.py b/dags/springer/springer_process_file.py index 97f446df..6c173eee 100644 --- a/dags/springer/springer_process_file.py +++ b/dags/springer/springer_process_file.py @@ -4,18 +4,18 @@ import pendulum import requests from airflow.decorators import dag, task +from common.cleanup import ( + clean_inline_expressions, + clean_whitespace_characters, + convert_html_italics_to_latex, + convert_html_subscripts_to_latex, + replace_cdata_format, +) from common.enhancer import Enhancer from common.enricher import Enricher from common.exceptions import EmptyOutputFromPreviousTask from common.scoap3_s3 import Scoap3Repository from common.utils import create_or_update_article, upload_json_to_s3 -from common.cleanup import ( - replace_cdata_format, - convert_html_subscripts_to_latex, - convert_html_italics_to_latex, - clean_whitespace_characters, - clean_inline_expressions, -) from inspire_utils.record import get_value from jsonschema import validate from springer.parser import SpringerParser @@ -24,22 +24,24 @@ logger = get_logger() + def process_xml(input): input = convert_html_subscripts_to_latex(input) input = convert_html_italics_to_latex(input) input = replace_cdata_format(input) input = clean_inline_expressions(input) - input = input.replace('\n', '').replace('\r', '').lstrip().rstrip() + input = input.replace("\n", "").replace("\r", "").lstrip().rstrip() input = clean_whitespace_characters(input.strip()) return input + def springer_parse_file(**kwargs): if "params" in kwargs and "file" in kwargs["params"]: encoded_xml = kwargs["params"]["file"] file_name = kwargs["params"]["file_name"] xml_bytes = base64.b64decode(encoded_xml) if isinstance(xml_bytes, bytes): - xml_bytes = xml_bytes.decode('utf-8') + xml_bytes = xml_bytes.decode("utf-8") xml_bytes = process_xml(xml_bytes) xml = ET.fromstring(xml_bytes.decode("utf-8")) diff --git a/tests/units/aps/test_aps_parser.py b/tests/units/aps/test_aps_parser.py index b7650dfe..9986f764 100644 --- a/tests/units/aps/test_aps_parser.py +++ b/tests/units/aps/test_aps_parser.py @@ -1,8 +1,8 @@ import json import pytest -from aps.parser import APSParser from aps.aps_process_file import enhance_aps +from aps.parser import APSParser @pytest.fixture(scope="module") @@ -203,6 +203,7 @@ def test_aps_parsing(parsed_articles, expected, key): assert key in article assert article[key] == expected_value + def test_aps_country_parsing(parsed_articles): for article in parsed_articles: enhance_aps(article) diff --git a/tests/units/springer/test_parser.py b/tests/units/springer/test_parser.py index 3cebb4f0..9a47990c 100644 --- a/tests/units/springer/test_parser.py +++ b/tests/units/springer/test_parser.py @@ -30,15 +30,17 @@ def parsed_articles(parser, articles): def test_weird_titles(parsed_articles): parsed_titles = sorted([a.get("title") for a in parsed_articles]) - expected_results = sorted([ - " $$(g-2)_{e,\\mu }$$ anomalies and decays $$h\\rightarrow e_a e_b$$ , " + expected_results = sorted( + [ + " $$(g-2)_{e,\\mu }$$ anomalies and decays $$h\\rightarrow e_a e_b$$ , " "$$Z\\rightarrow e_ae_b$$ , and $$e_b\\rightarrow e_a \\gamma $$ in a two " "Higgs doublet model with inverse seesaw neutrinos", - " $$\\Lambda $$ polarization in very high energy heavy ion collisions as a probe of the quark–gluon plasma formation and properties", - "A strategy for a general search for new phenomena using data-derived signal regions and its application within the ATLAS experiment", - "Revisiting the mechanical properties of the nucleon", - "Symmetry breaking in quantum curves and super Chern-Simons matrix models" - ]) + " $$\\Lambda $$ polarization in very high energy heavy ion collisions as a probe of the quark–gluon plasma formation and properties", + "A strategy for a general search for new phenomena using data-derived signal regions and its application within the ATLAS experiment", + "Revisiting the mechanical properties of the nucleon", + "Symmetry breaking in quantum curves and super Chern-Simons matrix models", + ] + ) assert expected_results == parsed_titles @@ -351,7 +353,7 @@ def test_abstract(parsed_articles): "experimental data of $$(g-2)_{e,\\mu }$$ as well as the " "promising LFV signals corresponding to the future experimental " "sensitivities.", - None + None, ) for abstract, article in zip(abstracts, parsed_articles): if abstract is None: diff --git a/tests/units/springer/test_parser/weird.title.Meta b/tests/units/springer/test_parser/weird.title.Meta index 31607de5..158b942c 100644 --- a/tests/units/springer/test_parser/weird.title.Meta +++ b/tests/units/springer/test_parser/weird.title.Meta @@ -220,4 +220,4 @@ - \ No newline at end of file +