From b731fd772bb4c313e6582e1bd3e6298cd8ccec19 Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Mon, 12 Aug 2024 14:01:05 +1000 Subject: [PATCH] Simplified Entities.escape to not require an OutputSettings This is a cleaner decoupling of OutputSettings and Entities than the previous impl which required a lazy initialisation of OutputSettings. Also simplified how we get a fallback encoder. Related to #1910, #2042 --- src/main/java/org/jsoup/nodes/Document.java | 43 ++++------- src/main/java/org/jsoup/nodes/Entities.java | 83 ++++++++++++--------- src/main/java/org/jsoup/nodes/Node.java | 1 - 3 files changed, 65 insertions(+), 62 deletions(-) diff --git a/src/main/java/org/jsoup/nodes/Document.java b/src/main/java/org/jsoup/nodes/Document.java index a9a12f0672..7246f32679 100644 --- a/src/main/java/org/jsoup/nodes/Document.java +++ b/src/main/java/org/jsoup/nodes/Document.java @@ -14,7 +14,6 @@ import org.jspecify.annotations.Nullable; import java.nio.charset.Charset; -import java.nio.charset.CharsetEncoder; import java.util.List; import static org.jsoup.parser.Parser.NamespaceHtml; @@ -395,29 +394,32 @@ public static class OutputSettings implements Cloneable { * The output serialization syntax. */ public enum Syntax {html, xml} - private Entities.EscapeMode escapeMode = Entities.EscapeMode.base; - private Charset charset; - Entities.CoreCharset coreCharset; // fast encoders for ascii and utf8 - private final ThreadLocal encoderThreadLocal = new ThreadLocal<>(); // initialized by start of OuterHtmlVisitor - + private Charset charset = DataUtil.UTF_8; private boolean prettyPrint = true; private boolean outline = false; private int indentAmount = 1; private int maxPaddingWidth = 30; private Syntax syntax = Syntax.html; + /** + Create a new OutputSettings object, with the default settings (UTF-8, HTML, EscapeMode.base, pretty-printing, + indent amount of 1). 
+ */ public OutputSettings() { - charset(DataUtil.UTF_8); } - + /** - * Get the document's current HTML escape mode: base, which provides a limited set of named HTML - * entities and escapes other characters as numbered entities for maximum compatibility; or extended, - * which uses the complete set of HTML named entities. - *

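- * The default escape mode is base. - * @return the document's current escape mode + Get the document's current entity escape mode:
+ <ul>
+ <li><code>xhtml</code>, a minimal set of named entities suitable for XHTML / XML output</li>
+ <li><code>base</code>, which provides a limited set of named HTML entities and escapes other characters as numbered entities for maximum compatibility</li>
+ <li><code>extended</code>, which uses the complete set of HTML named entities</li>
+ </ul>
+ <p>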
The default escape mode is base. + @return the document's current escape mode */ public Entities.EscapeMode escapeMode() { return escapeMode; @@ -453,7 +455,6 @@ public Charset charset() { */ public OutputSettings charset(Charset charset) { this.charset = charset; - coreCharset = Entities.CoreCharset.byName(charset.name()); return this; } @@ -467,18 +468,6 @@ public OutputSettings charset(String charset) { return this; } - CharsetEncoder prepareEncoder() { - // created at start of OuterHtmlVisitor so each pass has own encoder, so OutputSettings can be shared among threads - CharsetEncoder encoder = charset.newEncoder(); - encoderThreadLocal.set(encoder); - return encoder; - } - - CharsetEncoder encoder() { - CharsetEncoder encoder = encoderThreadLocal.get(); - return encoder != null ? encoder : prepareEncoder(); - } - /** * Get the document's current output syntax. * @return current syntax diff --git a/src/main/java/org/jsoup/nodes/Entities.java b/src/main/java/org/jsoup/nodes/Entities.java index d171f8dfc9..c4503dd02e 100644 --- a/src/main/java/org/jsoup/nodes/Entities.java +++ b/src/main/java/org/jsoup/nodes/Entities.java @@ -1,14 +1,15 @@ package org.jsoup.nodes; import org.jsoup.SerializationException; +import org.jsoup.helper.DataUtil; import org.jsoup.internal.StringUtil; import org.jsoup.helper.Validate; import org.jsoup.nodes.Document.OutputSettings; import org.jsoup.parser.CharacterReader; import org.jsoup.parser.Parser; -import org.jspecify.annotations.Nullable; import java.io.IOException; +import java.nio.charset.Charset; import java.nio.charset.CharsetEncoder; import java.util.Arrays; import java.util.HashMap; @@ -136,51 +137,55 @@ public static int codepointsForName(final String name, final int[] codepoints) { /** HTML escape an input string. That is, {@code <} is returned as {@code <}. The escaped string is suitable for use both in attributes and in text data. 
- @param string the un-escaped string to escape + @param data the un-escaped string to escape @param out the output settings to use. This configures the character set escaped against (that is, if a character is supported in the output character set, it doesn't have to be escaped), and also HTML or XML settings. @return the escaped string */ - public static String escape(String string, OutputSettings out) { - if (string == null) + public static String escape(String data, OutputSettings out) { + return escapeString(data, out.escapeMode(), out.syntax(), out.charset()); + } + + /** + HTML escape an input string, using the default settings (UTF-8, base entities, HTML syntax). That is, {@code <} is + returned as {@code &lt;}. The escaped string is suitable for use both in attributes and in text data. + @param data the un-escaped string to escape + @return the escaped string + @see #escape(String, OutputSettings) + */ + public static String escape(String data) { + return escapeString(data, base, Syntax.html, DataUtil.UTF_8); + } + + private static String escapeString(String data, EscapeMode escapeMode, Syntax syntax, Charset charset) { + if (data == null) return ""; StringBuilder accum = StringUtil.borrowBuilder(); try { - escape(accum, string, out, ForText | ForAttribute); // for text and for attribute; preserve whitespaces + doEscape(data, accum, escapeMode, syntax, charset, ForText | ForAttribute); } catch (IOException e) { throw new SerializationException(e); // doesn't happen } return StringUtil.releaseBuilder(accum); } - /** - * HTML escape an input string, using the default settings (UTF-8, base entities). That is, {@code <} is returned as
- * - * @param string the un-escaped string to escape - * @return the escaped string - * @see #escape(String, OutputSettings) - */ - public static String escape(String string) { - if (DefaultOutput == null) - DefaultOutput = new OutputSettings(); - return escape(string, DefaultOutput); + + static void escape(Appendable accum, String data, OutputSettings out, int options) throws IOException { + doEscape(data, accum, out.escapeMode(), out.syntax(), out.charset(), options); } - private static @Nullable OutputSettings DefaultOutput; // lazy-init, to break circular dependency with OutputSettings - static void escape(Appendable accum, String string, OutputSettings out, int options) throws IOException { - final EscapeMode escapeMode = out.escapeMode(); - final CharsetEncoder encoder = out.encoder(); - final CoreCharset coreCharset = out.coreCharset; // init in out.prepareEncoder() - final int length = string.length(); + private static void doEscape(String data, Appendable accum, EscapeMode mode, Syntax syntax, Charset charset, int options) throws IOException { + final CoreCharset coreCharset = CoreCharset.byName(charset.name()); + final CharsetEncoder fallback = encoderFor(charset); + final int length = data.length(); int codePoint; boolean lastWasWhite = false; boolean reachedNonWhite = false; boolean skipped = false; for (int offset = 0; offset < length; offset += Character.charCount(codePoint)) { - codePoint = string.codePointAt(offset); + codePoint = data.codePointAt(offset); if ((options & Normalise) != 0) { if (StringUtil.isWhitespace(codePoint)) { @@ -202,12 +207,12 @@ static void escape(Appendable accum, String string, OutputSettings out, int opti } } } - appendEscaped(accum, out, options, codePoint, escapeMode, encoder, coreCharset); + appendEscaped(codePoint, accum, options, mode, syntax, coreCharset, fallback); } } - private static void appendEscaped(Appendable accum, OutputSettings out, int options, - int codePoint, EscapeMode escapeMode, CharsetEncoder 
encoder, CoreCharset coreCharset) throws IOException { + private static void appendEscaped(int codePoint, Appendable accum, int options, EscapeMode escapeMode, + Syntax syntax, CoreCharset coreCharset, CharsetEncoder fallback) throws IOException { // surrogate pairs, split implementation for efficiency on single char common case (saves creating strings, char[]): final char c = (char) codePoint; @@ -222,7 +227,7 @@ private static void appendEscaped(Appendable accum, OutputSettings out, int opti break; case '<': // escape when in character data or when in a xml attribute val or XML syntax; not needed in html attr val - appendLt(accum, options, escapeMode, out); + appendLt(accum, options, escapeMode, syntax); break; case '>': if ((options & ForText) != 0) accum.append(">"); @@ -243,11 +248,11 @@ private static void appendEscaped(Appendable accum, OutputSettings out, int opti accum.append(c); break; default: - if (c < 0x20 || !canEncode(coreCharset, c, encoder)) appendEncoded(accum, escapeMode, codePoint); + if (c < 0x20 || !canEncode(coreCharset, c, fallback)) appendEncoded(accum, escapeMode, codePoint); else accum.append(c); } } else { - if (canEncode(coreCharset, c, encoder)) { + if (canEncode(coreCharset, c, fallback)) { // reads into charBuf - we go through these steps to avoid GC objects as much as possible (would be a new String and a new char[2] for each character) char[] chars = charBuf.get(); int len = Character.toChars(codePoint, chars, 0); @@ -268,9 +273,9 @@ private static void appendNbsp(Appendable accum, EscapeMode escapeMode) throws I else accum.append(" "); } - private static void appendLt(Appendable accum, int options, EscapeMode escapeMode, OutputSettings out) throws IOException { - if ((options & ForText) != 0 || escapeMode == EscapeMode.xhtml || out.syntax() == Syntax.xml) accum.append("<"); - else accum.append('<'); + private static void appendLt(Appendable accum, int options, EscapeMode escapeMode, Syntax syntax) throws IOException { + if 
((options & ForText) != 0 || escapeMode == EscapeMode.xhtml || syntax == Syntax.xml) accum.append("<"); + else accum.append('<'); // no need to escape < when in an HTML attribute } private static void appendApos(Appendable accum, int options, EscapeMode escapeMode) throws IOException { @@ -282,7 +287,6 @@ private static void appendApos(Appendable accum, int options, EscapeMode escapeM } } - private static void appendEncoded(Appendable accum, EscapeMode escapeMode, int codePoint) throws IOException { final String name = escapeMode.nameForCodepoint(codePoint); if (!emptyName.equals(name)) // ok for identity check @@ -349,6 +353,17 @@ static CoreCharset byName(final String name) { } } + // cache the last used fallback encoder to save recreating on every use + private static final ThreadLocal LocalEncoder = new ThreadLocal<>(); + private static CharsetEncoder encoderFor(Charset charset) { + CharsetEncoder encoder = LocalEncoder.get(); + if (encoder == null || !encoder.charset().equals(charset)) { + encoder = charset.newEncoder(); + LocalEncoder.set(encoder); + } + return encoder; + } + private static void load(EscapeMode e, String pointsData, int size) { e.nameKeys = new String[size]; e.codeVals = new int[size]; diff --git a/src/main/java/org/jsoup/nodes/Node.java b/src/main/java/org/jsoup/nodes/Node.java index b167286ed2..856851eca2 100644 --- a/src/main/java/org/jsoup/nodes/Node.java +++ b/src/main/java/org/jsoup/nodes/Node.java @@ -942,7 +942,6 @@ private static class OuterHtmlVisitor implements NodeVisitor { OuterHtmlVisitor(Appendable accum, Document.OutputSettings out) { this.accum = accum; this.out = out; - out.prepareEncoder(); } public void head(Node node, int depth) {