From e535721a4774e69cabdf1a56093ad48027dd5b29 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Tue, 24 Oct 2023 09:34:25 +0800 Subject: [PATCH] Allow matching HTML tags that have XML attribute Add ":" to the HTML tag regex pattern and fix a regex flag error in the `inside_html_tags_re` variable. --- src/wikitextprocessor/parser.py | 9 +++++---- tests/test_parser.py | 15 ++++++++++++++- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/src/wikitextprocessor/parser.py b/src/wikitextprocessor/parser.py index e27f6205..e7988ca3 100644 --- a/src/wikitextprocessor/parser.py +++ b/src/wikitextprocessor/parser.py @@ -285,7 +285,8 @@ class NodeKind(enum.Flag): # This means that if you have nesting capturing groups, # the contents will be repeated partly. inside_html_tags_re = re.compile( - r"(<(?i:" + r"|".join(ALLOWED_HTML_TAGS.keys()) + r")\s+[^><]*>)" + r"(<(?:" + r"|".join(ALLOWED_HTML_TAGS.keys()) + r")[^><]*>)", + re.IGNORECASE ) # We don't have specs for this, so let's assume... @@ -1738,7 +1739,7 @@ def tag_fn(ctx: "Wtp", token: str) -> None: # Try to parse it as a start tag m = re.match( - r"""<([-a-zA-Z0-9]+)\s*((\b[-a-zA-Z0-9]+(=("[^"]*"|""" + r"""<([-a-zA-Z0-9]+)\s*((\b[-a-zA-Z0-9:]+(=("[^"]*"|""" r"""'[^']*'|[^ \t\n"'`=<>/]*))?\s*)*)(/?)\s*>""", token, ) @@ -1955,8 +1956,8 @@ def magicword_fn(ctx: "Wtp", token: str) -> None: r"[ \t]+\n*|" r":|" # sometimes special when not beginning of line r"<<[-a-zA-Z0-9/]*>>|" - r"""<[-a-zA-Z0-9]+\s*(\b[-a-zA-Z0-9]+(=("[^<>"]*"|""" - r"""'[^<>']*'|[^ \t\n"'`=<>]*))?\s*)*/?>|""" + r"""<[-a-zA-Z0-9]+\s*(\b[-a-zA-Z0-9:]+(=("[^<>"]*"|""" # HTML start tag + r"""'[^<>']*'|[^ \t\n"'`=<>]*))?\s*)*/?>|""" # HTML start tag r"|" r"(" + r"|".join(r"\b{}\b".format(x) for x in MAGIC_WORDS) diff --git a/tests/test_parser.py b/tests/test_parser.py index df193e02..29fa1066 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -2425,7 +2425,7 @@ def test_latex_math_tag_template_parameter(self): 
self.assertEqual(template_node.template_name, "quote-book") self.assertEqual( template_node.template_parameters, - {1: "en", 2: "\\frac{1}{2}"} + {1: "en", 2: "\\frac{1}{2}"}, ) def test_match_template_contains_unpaired_curly_brackets(self): @@ -2441,6 +2441,19 @@ def test_find_two_kinds_of_nodes(self): self.assertTrue(isinstance(found_nodes[0], TemplateNode)) self.assertTrue(isinstance(found_nodes[1], HTMLNode)) + def test_parse_html_with_xml_attribute(self): + # https://fr.wiktionary.org/wiki/autrice + # expanded from template "équiv-pour" + # https://fr.wiktionary.org/wiki/Modèle:équiv-pour + tree = self.parse( + "", + '<bdi lang="fr" xml:lang="fr" class="lang-fr">[[auteur#fr|auteur]]</bdi>', + ) + self.assertTrue(isinstance(tree.children[0], HTMLNode)) + self.assertEqual(tree.children[0].tag, "bdi") + self.assertEqual(tree.children[0].children[0].kind, NodeKind.LINK) + + # XXX implement marking for links, templates # - https://en.wikipedia.org/wiki/Help:Wikitext#Nowiki # - fix test_nowiki11 and continue