Skip to content

Commit

Permalink
Allow matching HTML tags that have XML attributes
Browse files Browse the repository at this point in the history
Add ":" to the HTML tag regex pattern and fix a regex flag error in
the `inside_html_tags_re` variable.
  • Loading branch information
xxyzz committed Oct 24, 2023
1 parent e45d551 commit e535721
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 5 deletions.
9 changes: 5 additions & 4 deletions src/wikitextprocessor/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,8 @@ class NodeKind(enum.Flag):
# This means that if you have nested capturing groups,
# their contents will be partly repeated.
inside_html_tags_re = re.compile(
r"(<(?i:" + r"|".join(ALLOWED_HTML_TAGS.keys()) + r")\s+[^><]*>)"
r"(<(?:" + r"|".join(ALLOWED_HTML_TAGS.keys()) + r")[^><]*>)",
re.IGNORECASE
)

# We don't have specs for this, so let's assume...
Expand Down Expand Up @@ -1738,7 +1739,7 @@ def tag_fn(ctx: "Wtp", token: str) -> None:

# Try to parse it as a start tag
m = re.match(
r"""<([-a-zA-Z0-9]+)\s*((\b[-a-zA-Z0-9]+(=("[^"]*"|"""
r"""<([-a-zA-Z0-9]+)\s*((\b[-a-zA-Z0-9:]+(=("[^"]*"|"""
r"""'[^']*'|[^ \t\n"'`=<>/]*))?\s*)*)(/?)\s*>""",
token,
)
Expand Down Expand Up @@ -1955,8 +1956,8 @@ def magicword_fn(ctx: "Wtp", token: str) -> None:
r"[ \t]+\n*|"
r":|" # sometimes special when not beginning of line
r"<<[-a-zA-Z0-9/]*>>|"
r"""<[-a-zA-Z0-9]+\s*(\b[-a-zA-Z0-9]+(=("[^<>"]*"|"""
r"""'[^<>']*'|[^ \t\n"'`=<>]*))?\s*)*/?>|"""
r"""<[-a-zA-Z0-9]+\s*(\b[-a-zA-Z0-9:]+(=("[^<>"]*"|""" # HTML start tag
r"""'[^<>']*'|[^ \t\n"'`=<>]*))?\s*)*/?>|""" # HTML start tag
r"</[-a-zA-Z0-9]+\s*>|"
r"("
+ r"|".join(r"\b{}\b".format(x) for x in MAGIC_WORDS)
Expand Down
15 changes: 14 additions & 1 deletion tests/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2425,7 +2425,7 @@ def test_latex_math_tag_template_parameter(self):
self.assertEqual(template_node.template_name, "quote-book")
self.assertEqual(
template_node.template_parameters,
{1: "en", 2: "<math>\\frac{1}{2}</math>"}
{1: "en", 2: "<math>\\frac{1}{2}</math>"},
)

def test_match_template_contains_unpaired_curly_brackets(self):
Expand All @@ -2441,6 +2441,19 @@ def test_find_two_kinds_of_nodes(self):
self.assertTrue(isinstance(found_nodes[0], TemplateNode))
self.assertTrue(isinstance(found_nodes[1], HTMLNode))

def test_parse_html_with_xml_attribute(self):
    """Check that a tag with an ``xml:lang`` attribute parses as HTML."""
    # The wikitext comes from https://fr.wiktionary.org/wiki/autrice,
    # where it is expanded from the template "équiv-pour":
    # https://fr.wiktionary.org/wiki/Modèle:équiv-pour
    wikitext = '<bdi lang="fr" xml:lang="fr" class="lang-fr">[[auteur#fr|auteur]]</bdi>'
    root = self.parse("", wikitext)
    bdi_node = root.children[0]
    self.assertTrue(isinstance(bdi_node, HTMLNode))
    self.assertEqual(bdi_node.tag, "bdi")
    # The first child of the <bdi> node is expected to be a link node.
    first_child = bdi_node.children[0]
    self.assertEqual(first_child.kind, NodeKind.LINK)


# XXX implement <nowiki/> marking for links, templates
# - https://en.wikipedia.org/wiki/Help:Wikitext#Nowiki
# - fix test_nowiki11 and continue
Expand Down

0 comments on commit e535721

Please sign in to comment.