Allow matching HTML tags that have XML attributes

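Add ":" to the attribute-name character class of the HTML start-tag regex patterns, so that namespaced attributes such as `xml:lang` are matched, and fix a regex flag error in the `inside_html_tags_re` variable by replacing the inline `(?i:...)` group with the `re.IGNORECASE` flag.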
empiriker · Oct 24, 2023 · e535721 · e535721
1 parent e45d551
commit e535721
Show file tree

Hide file tree

Showing 2 changed files with 19 additions and 5 deletions.
diff --git a/src/wikitextprocessor/parser.py b/src/wikitextprocessor/parser.py
@@ -285,7 +285,8 @@ class NodeKind(enum.Flag):
 # This means that if you have nesting capturing groups,
 # the contents will be repeated partly.
 inside_html_tags_re = re.compile(
-    r"(<(?i:" + r"|".join(ALLOWED_HTML_TAGS.keys()) + r")\s+[^><]*>)"
+    r"(<(?:" + r"|".join(ALLOWED_HTML_TAGS.keys()) + r")[^><]*>)",
+    re.IGNORECASE
 )
 
 # We don't have specs for this, so let's assume...
@@ -1738,7 +1739,7 @@ def tag_fn(ctx: "Wtp", token: str) -> None:
 
     # Try to parse it as a start tag
     m = re.match(
-        r"""<([-a-zA-Z0-9]+)\s*((\b[-a-zA-Z0-9]+(=("[^"]*"|"""
+        r"""<([-a-zA-Z0-9]+)\s*((\b[-a-zA-Z0-9:]+(=("[^"]*"|"""
         r"""'[^']*'|[^ \t\n"'`=<>/]*))?\s*)*)(/?)\s*>""",
         token,
     )
@@ -1955,8 +1956,8 @@ def magicword_fn(ctx: "Wtp", token: str) -> None:
     r"[ \t]+\n*|"
     r":|"  # sometimes special when not beginning of line
     r"<<[-a-zA-Z0-9/]*>>|"
-    r"""<[-a-zA-Z0-9]+\s*(\b[-a-zA-Z0-9]+(=("[^<>"]*"|"""
-    r"""'[^<>']*'|[^ \t\n"'`=<>]*))?\s*)*/?>|"""
+    r"""<[-a-zA-Z0-9]+\s*(\b[-a-zA-Z0-9:]+(=("[^<>"]*"|"""  # HTML start tag
+    r"""'[^<>']*'|[^ \t\n"'`=<>]*))?\s*)*/?>|"""  # HTML start tag
     r"</[-a-zA-Z0-9]+\s*>|"
     r"("
     + r"|".join(r"\b{}\b".format(x) for x in MAGIC_WORDS)

diff --git a/tests/test_parser.py b/tests/test_parser.py
@@ -2425,7 +2425,7 @@ def test_latex_math_tag_template_parameter(self):
         self.assertEqual(template_node.template_name, "quote-book")
         self.assertEqual(
             template_node.template_parameters,
-            {1: "en", 2: "<math>\\frac{1}{2}</math>"}
+            {1: "en", 2: "<math>\\frac{1}{2}</math>"},
         )
 
     def test_match_template_contains_unpaired_curly_brackets(self):
@@ -2441,6 +2441,19 @@ def test_find_two_kinds_of_nodes(self):
         self.assertTrue(isinstance(found_nodes[0], TemplateNode))
         self.assertTrue(isinstance(found_nodes[1], HTMLNode))
 
+    def test_parse_html_with_xml_attribute(self):
+        # https://fr.wiktionary.org/wiki/autrice
+        # expanded from template "équiv-pour"
+        # https://fr.wiktionary.org/wiki/Modèle:équiv-pour
+        tree = self.parse(
+            "",
+            '<bdi lang="fr" xml:lang="fr" class="lang-fr">[[auteur#fr|auteur]]</bdi>',
+        )
+        self.assertTrue(isinstance(tree.children[0], HTMLNode))
+        self.assertEqual(tree.children[0].tag, "bdi")
+        self.assertEqual(tree.children[0].children[0].kind, NodeKind.LINK)
+
+
 # XXX implement <nowiki/> marking for links, templates
 #  - https://en.wikipedia.org/wiki/Help:Wikitext#Nowiki
 #  - fix test_nowiki11 and continue