From a2e084a0954ef3bb442be5ad23ca8a6eafe46557 Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Thu, 22 Feb 2024 13:36:07 -0600 Subject: [PATCH] CLDR-17371 perf improvement for DTD inclusion - no statistically significant increase in parsing cost over 10,000 iterations. --- ...tory.java => DoctypeXmlStreamWrapper.java} | 97 ++++++++++++++----- .../org/unicode/cldr/util/XMLFileReader.java | 2 +- .../org/unicode/cldr/unittest/TestBasic.java | 4 +- ....java => TestDoctypeXmlStreamWrapper.java} | 23 +++-- 4 files changed, 91 insertions(+), 35 deletions(-) rename tools/cldr-code/src/main/java/org/unicode/cldr/util/{DTDInsertingReaderFactory.java => DoctypeXmlStreamWrapper.java} (58%) rename tools/cldr-code/src/test/java/org/unicode/cldr/util/{TestDTDInsertingReaderFactory.java => TestDoctypeXmlStreamWrapper.java} (83%) diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/DTDInsertingReaderFactory.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/DoctypeXmlStreamWrapper.java similarity index 58% rename from tools/cldr-code/src/main/java/org/unicode/cldr/util/DTDInsertingReaderFactory.java rename to tools/cldr-code/src/main/java/org/unicode/cldr/util/DoctypeXmlStreamWrapper.java index 9f7e274d0ad..f1e9f7370e9 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/DTDInsertingReaderFactory.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/DoctypeXmlStreamWrapper.java @@ -6,19 +6,27 @@ import java.io.PushbackReader; import java.io.Reader; import java.io.UnsupportedEncodingException; +import java.nio.charset.StandardCharsets; import java.util.Arrays; +import org.unicode.cldr.icu.LDMLConstants; import org.xml.sax.InputSource; -public class DTDInsertingReaderFactory { +public class DoctypeXmlStreamWrapper { + private static final String DOCTYPE = ""); if (n == -1) { throw new IllegalArgumentException("Invalid XML prefix: ?> not found."); } + n += 2; // move the cut-point to the end of the "?>" sequence final String doctype = "\n" + d.getDoctype() + "\n"; - final String s2 = s.substring(0, n + 2) + doctype + s.substring(n + 2); - - if (false) System.out.println(s2); // DEBUG: print out updated header + final String s2 = s.substring(0, n) + doctype + s.substring(n); return s2; } - public static Reader wrap(Reader src) throws IOException { - PushbackReader pr = new PushbackReader(src, BUFFER_MAX_SIZE); - char inbuf[] = new char[BUFFER_READ_SIZE]; - int readlen = pr.read(inbuf); - if (!hasDocType(inbuf, readlen)) { - char buf2[] = Arrays.copyOf(inbuf, readlen); - inbuf = fixup(new String(buf2)).toCharArray(); - readlen = inbuf.length; - } - pr.unread(inbuf, 0, readlen); - return pr; - } - - private static boolean hasDocType(byte[] inbuf, String encoding) { + private static final boolean hasDocType(byte[] inbuf, String encoding) { if (inbuf == null || inbuf.length == 0) return false; + + // Try as utf-8/ASCII bytes - this will be the common case + if (arrayContains(inbuf, inbuf.length, DOCTYPE_BYTES)) return true; + + // break out here + if (encoding == null || encoding.equals("UTF-8")) return false; + + // Try 2, with encoding try { final String s = new String(inbuf, encoding); - return hasDocType(s.toCharArray(), s.length()); + return s.contains(DOCTYPE); } catch (UnsupportedEncodingException e) { throw new RuntimeException("While parsing " + encoding, e); } } - private static boolean hasDocType(char[] inbuf, int readlen) { + private static final boolean hasDocType(char[] inbuf, int readlen) { if (inbuf == null || readlen <= 0) { return false; } - final String s = new String(inbuf, 0, readlen); - return (s.contains("