From 4fb2e158810232cc4f65c5df430fb1dc31ec81e0 Mon Sep 17 00:00:00 2001 From: "Milena T. Bagdasarian" Date: Tue, 3 Sep 2024 14:00:03 +0200 Subject: [PATCH] check for HTML tags and remove them for LaTeX --- data_extraction/latex/build_latex.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/data_extraction/latex/build_latex.py b/data_extraction/latex/build_latex.py index 9651e9a..c0b7723 100644 --- a/data_extraction/latex/build_latex.py +++ b/data_extraction/latex/build_latex.py @@ -7,6 +7,7 @@ import io from decimal import Decimal import shutil +import re methodsdir = "../../methods" imagedir = "../../project-page/static/images/" @@ -208,6 +209,9 @@ def extract_title_and_text(markdown: str): # Who puts hashtags in a title anyway? title = lines[0].replace("#", "").strip() text = "\n".join(lines[1:]).strip() + # check for html + clean = re.compile("<.*?>") + text = re.sub(clean, "", text) return title, text