Skip to content

Commit

Permalink
CLDR-17371 perf improvement for DTD inclusion
Browse files Browse the repository at this point in the history
- no statistically significant increase in parsing cost over 10,000 iterations.
  • Loading branch information
srl295 committed Feb 22, 2024
1 parent 34c7b2e commit 9ffb82d
Show file tree
Hide file tree
Showing 4 changed files with 91 additions and 35 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,27 @@
import java.io.PushbackReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import org.unicode.cldr.icu.LDMLConstants;
import org.xml.sax.InputSource;

public class DTDInsertingReaderFactory {
public class DoctypeXmlStreamWrapper {
private static final String DOCTYPE = "<!DOCTYPE";
private static final char DOCTYPE_CHARS[] = DOCTYPE.toCharArray();
private static final byte DOCTYPE_BYTES[] = DOCTYPE.getBytes(StandardCharsets.UTF_8);
// the string to look for: xmlns="
private static final String XMLNS_EQUALS = LDMLConstants.XMLNS + "=\"";
/**
* Size of the input buffer, needs to be able to handle any expansion when the header is updated
*/
public static int BUFFER_MAX_SIZE = 1024;
/** Size of the first read, needs to contain xmlns="..." and be less than BUFFER_MAX_SIZE */
public static int BUFFER_READ_SIZE = 512;

/**
* Wrap a Reader in something that will automatically insert a DTD reference in place of an
* xmlns directive.
* Wrap an InputSource in something that will automatically insert a DTD reference in place of
* an xmlns directive.
*
* @throws IOException
*/
Expand All @@ -36,6 +44,7 @@ public static InputSource wrap(InputSource src) throws IOException {
return src;
}

/** wrap a byte oriented stream */
public static InputStream wrap(InputStream src, String encoding) throws IOException {
if (encoding == null) {
encoding = "UTF-8";
Expand All @@ -49,6 +58,21 @@ public static InputStream wrap(InputStream src, String encoding) throws IOExcept
return pr;
}

/** wrap a char oriented stream */
public static Reader wrap(Reader src) throws IOException {
PushbackReader pr = new PushbackReader(src, BUFFER_MAX_SIZE);
char inbuf[] = new char[BUFFER_READ_SIZE];
int readlen = pr.read(inbuf);
if (!hasDocType(inbuf, readlen)) {
char buf2[] = Arrays.copyOf(inbuf, readlen);
inbuf = fixup(new String(buf2)).toCharArray();
readlen = inbuf.length;
}
pr.unread(inbuf, 0, readlen);
return pr;
}

/** Fix an input byte array, including the DOCTYPE */
private static String fixup(byte[] inbuf, String encoding) {
try {
final String s = new String(inbuf, encoding);
Expand All @@ -58,58 +82,81 @@ private static String fixup(byte[] inbuf, String encoding) {
}
}

/** Fix an input String, including DOCTYPE */
private static String fixup(final String s) {
// exit if nothing matches
for (final DtdType d : DtdType.values()) {
if (s.contains("xmlns=\"" + d.getNsUrl())) {
if (s.contains(XMLNS_EQUALS + d.getNsUrl())) {
return fixup(s, d);
}
}
// couldn't fix it, just pass through
return s;
}

/** Fix an input String given a specific DtdType. */
private static String fixup(String s, DtdType d) {
int n = s.indexOf("?>");
if (n == -1) {
throw new IllegalArgumentException("Invalid XML prefix: ?> not found.");
}
n += 2; // move the cut-point to the end of the "?>" sequence

final String doctype = "\n" + d.getDoctype() + "\n";
final String s2 = s.substring(0, n + 2) + doctype + s.substring(n + 2);

if (false) System.out.println(s2); // DEBUG: print out updated header
final String s2 = s.substring(0, n) + doctype + s.substring(n);
return s2;
}

public static Reader wrap(Reader src) throws IOException {
PushbackReader pr = new PushbackReader(src, BUFFER_MAX_SIZE);
char inbuf[] = new char[BUFFER_READ_SIZE];
int readlen = pr.read(inbuf);
if (!hasDocType(inbuf, readlen)) {
char buf2[] = Arrays.copyOf(inbuf, readlen);
inbuf = fixup(new String(buf2)).toCharArray();
readlen = inbuf.length;
}
pr.unread(inbuf, 0, readlen);
return pr;
}

private static boolean hasDocType(byte[] inbuf, String encoding) {
private static final boolean hasDocType(byte[] inbuf, String encoding) {
if (inbuf == null || inbuf.length == 0) return false;

// Try as utf-8/ASCII bytes - this will be the common case
if (arrayContains(inbuf, inbuf.length, DOCTYPE_BYTES)) return true;

// break out here
if (encoding == null || encoding.equals("UTF-8")) return false;

// Try 2, with encoding
try {
final String s = new String(inbuf, encoding);
return hasDocType(s.toCharArray(), s.length());
return s.contains(DOCTYPE);
} catch (UnsupportedEncodingException e) {
throw new RuntimeException("While parsing " + encoding, e);
}
}

private static boolean hasDocType(char[] inbuf, int readlen) {
private static final boolean hasDocType(char[] inbuf, int readlen) {
if (inbuf == null || readlen <= 0) {
return false;
}
final String s = new String(inbuf, 0, readlen);
return (s.contains("<!DOCTYPE"));
return arrayContains(inbuf, readlen, DOCTYPE_CHARS);
}

private static boolean arrayContains(char[] inbuf, int inlen, char[] testbuf) {
final int testlen = testbuf.length;
int t = 0;
for (int i = 0; i < inlen; i++) {
if (inbuf[i] == testbuf[t]) {
t++;
if (t == testlen) return true;
} else {
t = 0;
}
}
return false;
}

private static boolean arrayContains(byte[] inbuf, int inlen, byte[] testbuf) {
final int testlen = testbuf.length;
int t = 0;
for (int i = 0; i < inlen; i++) {
if (inbuf[i] == testbuf[t]) {
t++;
if (t == testlen) return true;
} else {
t = 0;
}
}
return false;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ public static void read(
try {
XMLReader xmlReader = createXMLReader(handlers, validating, allHandler);
// wrap the reader to insert a character stream
DTDInsertingReaderFactory.wrap(is);
DoctypeXmlStreamWrapper.wrap(is);
is.setSystemId(systemID);
try {
xmlReader.parse(is);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,9 @@
import org.unicode.cldr.util.CharacterFallbacks;
import org.unicode.cldr.util.CldrUtility;
import org.unicode.cldr.util.Counter;
import org.unicode.cldr.util.DTDInsertingReaderFactory;
import org.unicode.cldr.util.DiscreteComparator;
import org.unicode.cldr.util.DiscreteComparator.Ordering;
import org.unicode.cldr.util.DoctypeXmlStreamWrapper;
import org.unicode.cldr.util.DtdData;
import org.unicode.cldr.util.DtdData.Attribute;
import org.unicode.cldr.util.DtdData.Element;
Expand Down Expand Up @@ -377,7 +377,7 @@ public TimingInfo check(File systemID) {
xmlReader.setErrorHandler(new MyErrorHandler());
InputSource is = new InputSource(fis);
is.setSystemId(systemID.toString());
DTDInsertingReaderFactory.wrap(is);
DoctypeXmlStreamWrapper.wrap(is);
xmlReader.parse(is);
// fis.close();
} catch (SAXException | IOException e) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;
Expand All @@ -13,11 +14,19 @@
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

public class TestDTDInsertingReaderFactory {
public class TestDoctypeXmlStreamWrapper {
private static final int COUNT = 10; // increase this for perf testing
private static final String COMMON_MT = "common/main/mt.xml";
private static final String KEYBOARDS_MT = "keyboards/3.0/mt.xml";

// make sure we get some basic loading first before starting the clock
@BeforeAll
public static final void SetupStuff() throws IOException {
TestDoctypeXmlStreamWrapper t = new TestDoctypeXmlStreamWrapper();
t.TestReadJar();
t.TestReadCommon();
}

@Test
void TestProcessPathValues() {
for (int i = 0; i < COUNT; i++) {
Expand Down Expand Up @@ -58,7 +67,7 @@ void TestReadKeyboardByte() throws IOException, SAXException {
try (InputStream fis = new FileInputStream(KEYBOARDS_MT); ) {
InputSource is = new InputSource(fis);
is.setSystemId(KEYBOARDS_MT);
is = DTDInsertingReaderFactory.wrap(is);
is = DoctypeXmlStreamWrapper.wrap(is);
XMLFileReader.createXMLReader(-1, true, new LoggingHandler()).parse(is);
}
}
Expand All @@ -70,35 +79,35 @@ void TestReadKeyboardChar() throws IOException, SAXException {
InputStreamReader isr = new InputStreamReader(fis); ) {
InputSource is = new InputSource(isr);
is.setSystemId(KEYBOARDS_MT);
is = DTDInsertingReaderFactory.wrap(is);
is = DoctypeXmlStreamWrapper.wrap(is);
XMLFileReader.createXMLReader(-1, true, new LoggingHandler()).parse(is);
}
}

@ParameterizedTest
@ValueSource(booleans = {false, true})
@ValueSource(booleans = {false, true, false, true})
public void TestBytePerf(boolean wrapped) throws IOException, SAXException {
for (int i = 0; i < COUNT; i++) {
// mimic XMLFileHandler.read() here, but with wrapping enabled/disabled
try (InputStream fis = new FileInputStream(COMMON_MT); ) {
InputSource is = new InputSource(fis);
is.setSystemId(COMMON_MT);
if (wrapped) is = DTDInsertingReaderFactory.wrap(is);
// if (wrapped) is = DoctypeXmlStreamWrapper.wrap(is);
XMLFileReader.createXMLReader(-1, true, new LoggingHandler()).parse(is);
}
}
}

@ParameterizedTest
@ValueSource(booleans = {false, true})
@ValueSource(booleans = {false, true, false, true})
public void TestCharPerf(boolean wrapped) throws IOException, SAXException {
for (int i = 0; i < COUNT; i++) {
// mimic XMLFileHandler.read() here, but with wrapping enabled/disabled
try (InputStream fis = new FileInputStream(COMMON_MT);
InputStreamReader isr = new InputStreamReader(fis); ) {
InputSource is = new InputSource(isr);
is.setSystemId(COMMON_MT);
if (wrapped) is = DTDInsertingReaderFactory.wrap(is);
// if (wrapped) is = DoctypeXmlStreamWrapper.wrap(is);
XMLFileReader.createXMLReader(-1, true, new LoggingHandler()).parse(is);
}
}
Expand Down

0 comments on commit 9ffb82d

Please sign in to comment.