Skip to content

Commit

Permalink
Follow spec so < can start an attribute name
Browse files Browse the repository at this point in the history
Fixes #1483
  • Loading branch information
jhy committed Nov 24, 2024
1 parent 0ef4b70 commit d27370a
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 10 deletions.
3 changes: 3 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@
created (`html` or `body`). [2204](https://github.com/jhy/jsoup/issues/2204)
* Follow the current HTML specification in the tokenizer to allow `<` as part of a tag name, instead of emitting it as a
character node. [2230](https://github.com/jhy/jsoup/issues/2230)
* Similarly, allow a `<` as the start of an attribute name, instead of creating a new element. The previous behavior was
intended to parse closer to what we anticipated the author's intent to be, but that does not align with the spec or with
how browsers behave. [1483](https://github.com/jhy/jsoup/issues/1483)

## 1.18.1 (2024-Jul-10)

Expand Down
4 changes: 0 additions & 4 deletions src/main/java/org/jsoup/parser/TokeniserState.java
Original file line number Diff line number Diff line change
Expand Up @@ -568,10 +568,6 @@ private void anythingElse(Tokeniser t, CharacterReader r) {
case '/':
t.transition(SelfClosingStartTag);
break;
case '<': // NOTE: out of spec, but clear (spec has this as a part of the attribute name)
r.unconsume();
t.error(this);
// intended fall through as if >
case '>':
t.emitTagPending();
t.transition(Data);
Expand Down
23 changes: 18 additions & 5 deletions src/test/java/org/jsoup/parser/HtmlParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -79,13 +79,13 @@ private static Stream<Arguments> dupeAttributeData() {

@Test public void parsesQuiteRoughAttributes() {
String html = "<p =a>One<a <p>Something</p>Else";
// this (used to; now gets cleaner) gets a <p> with attr '=a' and an <a tag with an attribute named '<p'; and then auto-recreated
// this gets a <p> with attr '=a' and an <a tag with an attribute named '<p'; and then auto-recreated
Document doc = Jsoup.parse(html);

// NOTE: per spec this should be the test case. but impacts too many ppl
// assertEquals("<p =a>One<a <p>Something</a></p>\n<a <p>Else</a>", doc.body().html());

assertEquals("<p _a>One<a></a></p><p><a>Something</a></p><a>Else</a>", TextUtil.stripNewlines(doc.body().html()));
// =a is output as _a
assertEquals("<p _a>One<a <p>Something</a></p><a <p>Else</a>", TextUtil.stripNewlines(doc.body().html()));
Element p = doc.expectFirst("p");
assertNotNull(p.attribute("=a"));

doc = Jsoup.parse("<p .....>");
assertEquals("<p .....></p>", doc.body().html());
Expand Down Expand Up @@ -1939,4 +1939,17 @@ private static void assertMathNamespace(Element el) {
assertEquals("Hello", ab.text());
assertEquals("a<b", ab.tag().normalName());
}

@Test void ltInAttrStart() {
    // https://github.com/jhy/jsoup/issues/1483
    // A '<' where an attribute name would start is kept as part of that attribute's name,
    // rather than terminating the current tag and opening a new element.
    Document parsed = Jsoup.parse("<a before='foo' <junk after='bar'>One</a>");
    String bodyHtml = TextUtil.normalizeSpaces(parsed.body().html());
    assertEquals("<a before=\"foo\" <junk after=\"bar\">One</a>", bodyHtml);

    // The odd attribute is addressable by its literal name and carries an empty value.
    Attribute ltAttr = parsed.expectFirst("a").attribute("<junk");
    assertNotNull(ltAttr);
    assertEquals("", ltAttr.getValue());
}
}
2 changes: 1 addition & 1 deletion src/test/java/org/jsoup/parser/TokeniserStateTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ public void testOpeningAngleBracketInsteadOfAttribute() {

Parser.parseFragment(triggeringSnippet, null, "", errorList);

assertEquals(6, errorList.get(0).getPosition());
assertEquals(7, errorList.get(0).getPosition());
}

@Test
Expand Down

0 comments on commit d27370a

Please sign in to comment.