Skip to content

Commit

Permalink
common(formatting): fix linting, formatting errors
Browse files Browse the repository at this point in the history
Signed-off-by: pamfilos <[email protected]>
  • Loading branch information
pamfilos committed Dec 5, 2024
1 parent 109223c commit 9980e05
Show file tree
Hide file tree
Showing 5 changed files with 30 additions and 26 deletions.
13 changes: 6 additions & 7 deletions dags/common/cleanup.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,28 +15,27 @@ def convert_html_subscripts_to_latex(input):
input = re.sub("<sup>(.*?)</sup>", r"$^{\1}$", input)
return input


def clean_inline_expressions(input):
    """Reduce inline-equation markup in *input* to its TeX source.

    The following rewrites are applied in order:

    1. ``<InlineEquation ...>...</InlineEquation>`` wrappers are removed,
       keeping their inner content.
    2. ``<EquationSource Format="TEX"><![CDATA[...]]></EquationSource>`` is
       replaced by the CDATA payload.
    3. ``<EquationSource Format="MATHML">...</EquationSource>`` is dropped
       entirely.
    4. All line-feed and carriage-return characters are removed.

    Args:
        input: The markup to clean, as a ``str``.

    Returns:
        The cleaned string.
    """
    # DOTALL lets the wrapper (and its content) span several lines.
    input = re.sub(
        r"<InlineEquation.*?>(.*?)</InlineEquation>", r"\1", input, flags=re.DOTALL
    )
    # NOTE(review): this pattern is the only one without re.DOTALL, so a TeX
    # CDATA payload containing a newline is left untouched -- confirm intended.
    input = re.sub(
        r"<EquationSource Format=\"TEX\"><!\[CDATA\[(.*?)\]\]></EquationSource>",
        r"\1",
        input,
    )
    input = re.sub(
        r"<EquationSource Format=\"MATHML\">.*?</EquationSource>",
        "",
        input,
        flags=re.DOTALL,
    )
    # Line breaks are stripped last, after all tag-level rewrites.
    input = input.replace("\n", "").replace("\r", "")

    return input


def convert_html_italics_to_latex(input):
    r"""Rewrite ``<italic ...>text</italic>`` spans as ``$\textit{text}$``.

    The opening tag may carry attributes. The ``\b`` after ``italic`` keeps
    longer tag names that merely start with "italic" from matching.

    Args:
        input: The markup to convert, as a ``str``.

    Returns:
        The string with every matching italic span replaced.
    """
    input = re.sub(r"<italic\b[^>]*>(.*?)</italic>", r"$\\textit{\1}$", input)
    return input
Expand Down
20 changes: 11 additions & 9 deletions dags/springer/springer_process_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,18 @@
import pendulum
import requests
from airflow.decorators import dag, task
from common.cleanup import (
clean_inline_expressions,
clean_whitespace_characters,
convert_html_italics_to_latex,
convert_html_subscripts_to_latex,
replace_cdata_format,
)
from common.enhancer import Enhancer
from common.enricher import Enricher
from common.exceptions import EmptyOutputFromPreviousTask
from common.scoap3_s3 import Scoap3Repository
from common.utils import create_or_update_article, upload_json_to_s3
from common.cleanup import (
replace_cdata_format,
convert_html_subscripts_to_latex,
convert_html_italics_to_latex,
clean_whitespace_characters,
clean_inline_expressions,
)
from inspire_utils.record import get_value
from jsonschema import validate
from springer.parser import SpringerParser
Expand All @@ -24,22 +24,24 @@

logger = get_logger()


def process_xml(input):
    """Run the XML text through the cleanup helpers before parsing.

    The steps run in this order: sub/superscript conversion, italics
    conversion, CDATA replacement, inline-expression cleanup, removal of
    line breaks plus trimming of surrounding whitespace, and finally
    ``clean_whitespace_characters``.

    Args:
        input: The XML document as a ``str``.

    Returns:
        Whatever ``clean_whitespace_characters`` returns for the cleaned text.
    """
    input = convert_html_subscripts_to_latex(input)
    input = convert_html_italics_to_latex(input)
    input = replace_cdata_format(input)
    input = clean_inline_expressions(input)
    # A single strip() covers both ends; no further trimming is needed
    # before the final whitespace cleanup.
    input = input.replace("\n", "").replace("\r", "").strip()
    input = clean_whitespace_characters(input)
    return input


def springer_parse_file(**kwargs):
if "params" in kwargs and "file" in kwargs["params"]:
encoded_xml = kwargs["params"]["file"]
file_name = kwargs["params"]["file_name"]
xml_bytes = base64.b64decode(encoded_xml)
if isinstance(xml_bytes, bytes):
xml_bytes = xml_bytes.decode('utf-8')
xml_bytes = xml_bytes.decode("utf-8")
xml_bytes = process_xml(xml_bytes)
xml = ET.fromstring(xml_bytes.decode("utf-8"))

Expand Down
3 changes: 2 additions & 1 deletion tests/units/aps/test_aps_parser.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import json

import pytest
from aps.parser import APSParser
from aps.aps_process_file import enhance_aps
from aps.parser import APSParser


@pytest.fixture(scope="module")
Expand Down Expand Up @@ -203,6 +203,7 @@ def test_aps_parsing(parsed_articles, expected, key):
assert key in article
assert article[key] == expected_value


def test_aps_country_parsing(parsed_articles):
for article in parsed_articles:
enhance_aps(article)
18 changes: 10 additions & 8 deletions tests/units/springer/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,17 @@ def parsed_articles(parser, articles):

def test_weird_titles(parsed_articles):
    """Check that titles containing inline TeX are parsed to the expected text.

    Both sides are sorted, so the comparison does not depend on the order in
    which the articles were parsed.
    """
    parsed_titles = sorted([a.get("title") for a in parsed_articles])
    expected_results = sorted(
        [
            # Adjacent literals are concatenated into one title.
            " $$(g-2)_{e,\\mu }$$ anomalies and decays $$h\\rightarrow e_a e_b$$ , "
            "$$Z\\rightarrow e_ae_b$$ , and $$e_b\\rightarrow e_a \\gamma $$ in a two "
            "Higgs doublet model with inverse seesaw neutrinos",
            " $$\\Lambda $$ polarization in very high energy heavy ion collisions as a probe of the quark–gluon plasma formation and properties",
            "A strategy for a general search for new phenomena using data-derived signal regions and its application within the ATLAS experiment",
            "Revisiting the mechanical properties of the nucleon",
            "Symmetry breaking in quantum curves and super Chern-Simons matrix models",
        ]
    )

    assert expected_results == parsed_titles

Expand Down Expand Up @@ -351,7 +353,7 @@ def test_abstract(parsed_articles):
"experimental data of $$(g-2)_{e,\\mu }$$ as well as the "
"promising LFV signals corresponding to the future experimental "
"sensitivities.",
None
None,
)
for abstract, article in zip(abstracts, parsed_articles):
if abstract is None:
Expand Down
2 changes: 1 addition & 1 deletion tests/units/springer/test_parser/weird.title.Meta
Original file line number Diff line number Diff line change
Expand Up @@ -220,4 +220,4 @@
</Issue>
</Volume>
</Journal>
</Publisher>
</Publisher>

0 comments on commit 9980e05

Please sign in to comment.