Skip to content

Commit

Permalink
fix: remove nbsp and social links from html imports
Browse files Browse the repository at this point in the history
  • Loading branch information
brownben committed Dec 5, 2024
1 parent 977196e commit a5abe7a
Showing 1 changed file with 13 additions and 2 deletions.
15 changes: 13 additions & 2 deletions backend/src/utils/import_file/import_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,30 @@
SITIMING_PARSE_FAILURE_MESSAGE = (
"Data not as expected for SITiming HTML file, please try another format."
)
NBSP_REGEX = re.compile(r" |\\\\u0026nbsp;", flags=re.IGNORECASE)
NBSP_REGEX = re.compile(" ", flags=re.IGNORECASE)
SOCIAL_LINK_REGEX = re.compile("<a href=.*?></a>", flags=re.IGNORECASE)


def parse_sitiming_script(script_tag_text: str) -> list[str]:
# remove preamble of the function definition
preamble_regex = re.compile(r".*?0\)\s*\n*\s*return", flags=re.DOTALL)
script_text = preamble_regex.sub("", script_tag_text)

# replace various escaped html entities
script_text = (
script_text.replace("\\u003c", "<")
.replace("\\u003e", ">")
.replace("\\u0026nbsp;", "")
)

# remove the end of the function, which isn't data
script_text = script_text.strip().removesuffix("}").removesuffix(";")

# strip out any remaining &nbsp; (non-breaking space) entities
script_text = NBSP_REGEX.sub(" ", script_text)
script_text = NBSP_REGEX.sub("", script_text)

# remove the links to social media
script_text = SOCIAL_LINK_REGEX.sub("", script_text)

# split into the blocks of JSON
IF_RETURN = r";\n*\s*if\s*\(tableNumber == [0-9]+\)\n*\s*return\s*\n*"
Expand Down

0 comments on commit a5abe7a

Please sign in to comment.