From b633428a98452a85ada6f932fbc1db875f4f4fdc Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Mon, 26 Feb 2024 14:56:54 +0000 Subject: [PATCH] CLDR-17371 stream filter to replace xmlns with DOCTYPE See #3511 --- keyboards/3.0/bn.xml | 3 +- keyboards/3.0/fr-t-k0-azerty.xml | 3 +- keyboards/3.0/ja-Latn.xml | 3 +- keyboards/3.0/mt-t-k0-47key.xml | 3 +- keyboards/3.0/mt.xml | 3 +- keyboards/3.0/pcm.xml | 3 +- keyboards/3.0/pt-t-k0-abnt2.xml | 3 +- keyboards/dtd/ldmlKeyboard3.dtd | 3 + keyboards/dtd/ldmlKeyboard3.xsd | 208 +++++++++--------- .../java/org/unicode/cldr/util/CLDRURLS.java | 2 + .../cldr/util/DoctypeXmlStreamWrapper.java | 162 ++++++++++++++ .../java/org/unicode/cldr/util/DtdType.java | 16 ++ .../org/unicode/cldr/util/XMLFileReader.java | 4 +- .../org/unicode/cldr/unittest/TestBasic.java | 2 + .../util/TestDoctypeXmlStreamWrapper.java | 115 ++++++++++ .../KeyboardFlatten/broken-import-missing.xml | 6 +- .../broken-import-unknownbase.xml | 6 +- .../broken-import-unknownver.xml | 6 +- .../broken-import-wrongparent.xml | 6 +- tools/pom.xml | 19 ++ 20 files changed, 437 insertions(+), 139 deletions(-) create mode 100644 tools/cldr-code/src/main/java/org/unicode/cldr/util/DoctypeXmlStreamWrapper.java create mode 100644 tools/cldr-code/src/test/java/org/unicode/cldr/util/TestDoctypeXmlStreamWrapper.java diff --git a/keyboards/3.0/bn.xml b/keyboards/3.0/bn.xml index bbdbe60f5b4..88f897ff4dc 100644 --- a/keyboards/3.0/bn.xml +++ b/keyboards/3.0/bn.xml @@ -1,6 +1,5 @@ - - + - + diff --git a/keyboards/3.0/ja-Latn.xml b/keyboards/3.0/ja-Latn.xml index b5364c5f928..d48acb70b01 100644 --- a/keyboards/3.0/ja-Latn.xml +++ b/keyboards/3.0/ja-Latn.xml @@ -1,6 +1,5 @@ - - + diff --git a/keyboards/3.0/mt-t-k0-47key.xml b/keyboards/3.0/mt-t-k0-47key.xml index 62f90b47bce..788be91a296 100644 --- a/keyboards/3.0/mt-t-k0-47key.xml +++ b/keyboards/3.0/mt-t-k0-47key.xml @@ -1,6 +1,5 @@ - - + diff --git a/keyboards/3.0/mt.xml b/keyboards/3.0/mt.xml index 5dcc9ea5c35..64e29cc1dc8 100644 --- a/keyboards/3.0/mt.xml +++ b/keyboards/3.0/mt.xml @@ -1,12 +1,11 @@ - - + diff --git a/keyboards/3.0/pcm.xml b/keyboards/3.0/pcm.xml index 89994773f67..867e35ec4f2 100644 --- a/keyboards/3.0/pcm.xml +++ b/keyboards/3.0/pcm.xml @@ -1,6 +1,5 @@ - - + diff --git a/keyboards/3.0/pt-t-k0-abnt2.xml b/keyboards/3.0/pt-t-k0-abnt2.xml index 010f5150734..64977cd23bf 100644 --- a/keyboards/3.0/pt-t-k0-abnt2.xml +++ b/keyboards/3.0/pt-t-k0-abnt2.xml @@ -1,6 +1,5 @@ - - + diff --git a/keyboards/dtd/ldmlKeyboard3.dtd b/keyboards/dtd/ldmlKeyboard3.dtd index 633aece6729..a5160c3119e 100644 --- a/keyboards/dtd/ldmlKeyboard3.dtd +++ b/keyboards/dtd/ldmlKeyboard3.dtd @@ -17,6 +17,9 @@ Please view the subcommittee page for the most recent information. + + + diff --git a/keyboards/dtd/ldmlKeyboard3.xsd b/keyboards/dtd/ldmlKeyboard3.xsd index f314094239e..9a445135fc8 100644 --- a/keyboards/dtd/ldmlKeyboard3.xsd +++ b/keyboards/dtd/ldmlKeyboard3.xsd @@ -11,7 +11,7 @@ Note: DTD @-annotations are not currently converted to .xsd. For full CLDR file CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/) --> - - + + @@ -102,15 +102,15 @@ Note: DTD @-annotations are not currently converted to .xsd. For full CLDR file - - - - - - - - - + + + + + + + + + @@ -122,9 +122,9 @@ Note: DTD @-annotations are not currently converted to .xsd. For full CLDR file - - - + + + @@ -135,7 +135,7 @@ Note: DTD @-annotations are not currently converted to .xsd. For full CLDR file - + @@ -143,24 +143,24 @@ Note: DTD @-annotations are not currently converted to .xsd. For full CLDR file - - - - - - - + + + + + + + - - - - + + + + - + @@ -170,7 +170,7 @@ Note: DTD @-annotations are not currently converted to .xsd. For full CLDR file - + @@ -197,24 +197,24 @@ Note: DTD @-annotations are not currently converted to .xsd. For full CLDR file - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + @@ -233,18 +233,18 @@ Note: DTD @-annotations are not currently converted to .xsd. For full CLDR file - - + + - - - - + + + + @@ -254,7 +254,7 @@ Note: DTD @-annotations are not currently converted to .xsd. For full CLDR file - + @@ -264,16 +264,16 @@ Note: DTD @-annotations are not currently converted to .xsd. For full CLDR file - - + + - - - + + + @@ -285,9 +285,9 @@ Note: DTD @-annotations are not currently converted to .xsd. For full CLDR file - - - + + + @@ -298,18 +298,18 @@ Note: DTD @-annotations are not currently converted to .xsd. For full CLDR file - - - + + + - - - - + + + + @@ -321,36 +321,36 @@ Note: DTD @-annotations are not currently converted to .xsd. For full CLDR file - + - - - - + + + + - - - - + + + + - - - + + + @@ -368,8 +368,8 @@ Note: DTD @-annotations are not currently converted to .xsd. For full CLDR file - - + + @@ -382,20 +382,20 @@ Note: DTD @-annotations are not currently converted to .xsd. For full CLDR file - + - - - - - - - + + + + + + + @@ -411,4 +411,4 @@ Note: DTD @-annotations are not currently converted to .xsd. For full CLDR file - \ No newline at end of file + diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/CLDRURLS.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/CLDRURLS.java index fc94139ec89..c7aff333b12 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/CLDRURLS.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/CLDRURLS.java @@ -7,6 +7,8 @@ * @author srl */ public abstract class CLDRURLS { + public static final String CLDR_SCHEMA_BASE = "https://schemas.unicode.org/cldr"; + public static final String CLDR_CURVER_BASE = CLDR_SCHEMA_BASE + "/" + CLDRFile.GEN_VERSION; /** Base URL for the CLDR repository */ public static final String CLDR_REPO_BASE = "https://github.com/unicode-org/cldr"; diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/DoctypeXmlStreamWrapper.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/DoctypeXmlStreamWrapper.java new file mode 100644 index 00000000000..f1e9f7370e9 --- /dev/null +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/DoctypeXmlStreamWrapper.java @@ -0,0 +1,162 @@ +package org.unicode.cldr.util; + +import java.io.IOException; +import java.io.InputStream; +import java.io.PushbackInputStream; +import java.io.PushbackReader; +import java.io.Reader; +import java.io.UnsupportedEncodingException; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import org.unicode.cldr.icu.LDMLConstants; +import org.xml.sax.InputSource; + +public class DoctypeXmlStreamWrapper { + private static final String DOCTYPE = ""); + if (n == -1) { + throw new IllegalArgumentException("Invalid XML prefix: ?> not found."); + } + n += 2; // move the cut-point to the end of the "?>" sequence + + final String doctype = "\n" + d.getDoctype() + "\n"; + final String s2 = s.substring(0, n) + doctype + s.substring(n); + return s2; + } + + private static final boolean hasDocType(byte[] inbuf, String encoding) { + if (inbuf == null || inbuf.length == 0) return false; + + // Try as utf-8/ASCII bytes - this will be the common case + if (arrayContains(inbuf, inbuf.length, DOCTYPE_BYTES)) return true; + + // break out here + if (encoding == null || encoding.equals("UTF-8")) return false; + + // Try 2, with encoding + try { + final String s = new String(inbuf, encoding); + return s.contains(DOCTYPE); + } catch (UnsupportedEncodingException e) { + throw new RuntimeException("While parsing " + encoding, e); + } + } + + private static final boolean hasDocType(char[] inbuf, int readlen) { + if (inbuf == null || readlen <= 0) { + return false; + } + return arrayContains(inbuf, readlen, DOCTYPE_CHARS); + } + + private static boolean arrayContains(char[] inbuf, int inlen, char[] testbuf) { + final int testlen = testbuf.length; + int t = 0; + for (int i = 0; i < inlen; i++) { + if (inbuf[i] == testbuf[t]) { + t++; + if (t == testlen) return true; + } else { + t = 0; + } + } + return false; + } + + private static boolean arrayContains(byte[] inbuf, int inlen, byte[] testbuf) { + final int testlen = testbuf.length; + int t = 0; + for (int i = 0; i < inlen; i++) { + if (inbuf[i] == testbuf[t]) { + t++; + if (t == testlen) return true; + } else { + t = 0; + } + } + return false; + } +} diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/DtdType.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/DtdType.java index 70d5e41c95c..e629f26530b 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/DtdType.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/DtdType.java @@ -1,6 +1,7 @@ package org.unicode.cldr.util; import com.google.common.collect.ImmutableSet; +import java.io.File; import java.lang.annotation.Annotation; import java.util.Collections; import java.util.Set; @@ -154,4 +155,19 @@ public String rootElement() { public String getXsdPath() { return dtdPath.replaceAll("\\.dtd$", ".xsd"); } + + /** The xmlns name for this dtd type */ + public String getNsUrl() { + return CLDRURLS.CLDR_CURVER_BASE + "/" + name(); + } + + /** The current version DTD as a URI */ + String getDtdUri() { + return new File(CLDRPaths.BASE_DIRECTORY, dtdPath).toURI().toString(); + } + + /** DOCTYPE for this DTD (current version) */ + String getDoctype() { + return ""; + } } diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/XMLFileReader.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/XMLFileReader.java index 1b72a7da369..46549e9817b 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/XMLFileReader.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/XMLFileReader.java @@ -182,6 +182,8 @@ public static void read( AllHandler allHandler) { try { XMLReader xmlReader = createXMLReader(handlers, validating, allHandler); + // wrap the reader to insert a character stream + DoctypeXmlStreamWrapper.wrap(is); is.setSystemId(systemID); try { xmlReader.parse(is); @@ -198,7 +200,7 @@ public static void read( } } - private static final XMLReader createXMLReader( + public static final XMLReader createXMLReader( int handlers, boolean validating, AllHandler allHandler) throws SAXNotRecognizedException, SAXNotSupportedException { XMLReader xmlReader = createXMLReader(validating); diff --git a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestBasic.java b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestBasic.java index d4925bf8115..ebd7783451c 100644 --- a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestBasic.java +++ b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestBasic.java @@ -58,6 +58,7 @@ import org.unicode.cldr.util.Counter; import org.unicode.cldr.util.DiscreteComparator; import org.unicode.cldr.util.DiscreteComparator.Ordering; +import org.unicode.cldr.util.DoctypeXmlStreamWrapper; import org.unicode.cldr.util.DtdData; import org.unicode.cldr.util.DtdData.Attribute; import org.unicode.cldr.util.DtdData.Element; @@ -376,6 +377,7 @@ public TimingInfo check(File systemID) { xmlReader.setErrorHandler(new MyErrorHandler()); InputSource is = new InputSource(fis); is.setSystemId(systemID.toString()); + DoctypeXmlStreamWrapper.wrap(is); xmlReader.parse(is); // fis.close(); } catch (SAXException | IOException e) { diff --git a/tools/cldr-code/src/test/java/org/unicode/cldr/util/TestDoctypeXmlStreamWrapper.java b/tools/cldr-code/src/test/java/org/unicode/cldr/util/TestDoctypeXmlStreamWrapper.java new file mode 100644 index 00000000000..a534136a1f9 --- /dev/null +++ b/tools/cldr-code/src/test/java/org/unicode/cldr/util/TestDoctypeXmlStreamWrapper.java @@ -0,0 +1,115 @@ +package org.unicode.cldr.util; + +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; +import org.unicode.cldr.util.XMLFileReader.LoggingHandler; +import org.unicode.cldr.util.XMLFileReader.SimpleHandler; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; + +public class TestDoctypeXmlStreamWrapper { + private static final int COUNT = 10; // increase this for perf testing + private static final String COMMON_MT = CLDRPaths.BASE_DIRECTORY + "/common/main/mt.xml"; + private static final String KEYBOARDS_MT = CLDRPaths.BASE_DIRECTORY + "/keyboards/3.0/mt.xml"; + + // make sure we get some basic loading first before starting the clock + @BeforeAll + public static final void SetupStuff() throws IOException { + TestDoctypeXmlStreamWrapper t = new TestDoctypeXmlStreamWrapper(); + t.TestReadJar(); + t.TestReadCommon(); + } + + @Test + void TestProcessPathValues() { + for (int i = 0; i < COUNT; i++) { + XMLFileReader.processPathValues(COMMON_MT, true, new SimpleHandler()); + } + } + + @Test + void TestReadJar() throws IOException { + for (int i = 0; i < COUNT; i++) { + new XMLFileReader() + .setHandler(new XMLFileReader.SimpleHandler()) + .readCLDRResource("dl_iso_table_a1.xml", -1, false); + } + } + + @Test + void TestReadCommon() throws FileNotFoundException, IOException { + for (int i = 0; i < COUNT; i++) { + new XMLFileReader() + .setHandler(new XMLFileReader.SimpleHandler()) + .read(COMMON_MT, -1, true); + } + } + + @Test + void TestReadKeyboard() throws FileNotFoundException, IOException { + for (int i = 0; i < COUNT; i++) { + new XMLFileReader() + .setHandler(new XMLFileReader.SimpleHandler()) + .read(KEYBOARDS_MT, -1, true); + } + } + + @Test + void TestReadKeyboardByte() throws IOException, SAXException { + // verify that reading via InputStream (byte) works as well + try (InputStream fis = new FileInputStream(KEYBOARDS_MT); ) { + InputSource is = new InputSource(fis); + is.setSystemId(KEYBOARDS_MT); + is = DoctypeXmlStreamWrapper.wrap(is); + XMLFileReader.createXMLReader(-1, true, new LoggingHandler()).parse(is); + } + } + + @Test + void TestReadKeyboardChar() throws IOException, SAXException { + // verify that reading via Reader (char) works as well + try (InputStream fis = new FileInputStream(KEYBOARDS_MT); + InputStreamReader isr = new InputStreamReader(fis); ) { + InputSource is = new InputSource(isr); + is.setSystemId(KEYBOARDS_MT); + is = DoctypeXmlStreamWrapper.wrap(is); + XMLFileReader.createXMLReader(-1, true, new LoggingHandler()).parse(is); + } + } + + @ParameterizedTest(name = "[{index}] wrapped={arguments}") + @ValueSource(booleans = {false, true, false, true}) + public void TestBytePerf(boolean wrapped) throws IOException, SAXException { + for (int i = 0; i < COUNT; i++) { + // mimic XMLFileHandler.read() here, but with wrapping enabled/disabled + try (InputStream fis = new FileInputStream(COMMON_MT); ) { + InputSource is = new InputSource(fis); + is.setSystemId(COMMON_MT); + if (wrapped) is = DoctypeXmlStreamWrapper.wrap(is); + XMLFileReader.createXMLReader(-1, true, new LoggingHandler()).parse(is); + } + } + } + + @ParameterizedTest(name = "[{index}] wrapped={arguments}") + @ValueSource(booleans = {false, true, false, true}) + public void TestCharPerf(boolean wrapped) throws IOException, SAXException { + for (int i = 0; i < COUNT; i++) { + // mimic XMLFileHandler.read() here, but with wrapping enabled/disabled + try (InputStream fis = new FileInputStream(COMMON_MT); + InputStreamReader isr = new InputStreamReader(fis); ) { + InputSource is = new InputSource(isr); + is.setSystemId(COMMON_MT); + if (wrapped) is = DoctypeXmlStreamWrapper.wrap(is); + XMLFileReader.createXMLReader(-1, true, new LoggingHandler()).parse(is); + } + } + } +} diff --git a/tools/cldr-code/src/test/resources/org/unicode/cldr/tool/KeyboardFlatten/broken-import-missing.xml b/tools/cldr-code/src/test/resources/org/unicode/cldr/tool/KeyboardFlatten/broken-import-missing.xml index 0fa4b16e7a9..69c240a1c19 100644 --- a/tools/cldr-code/src/test/resources/org/unicode/cldr/tool/KeyboardFlatten/broken-import-missing.xml +++ b/tools/cldr-code/src/test/resources/org/unicode/cldr/tool/KeyboardFlatten/broken-import-missing.xml @@ -1,9 +1,5 @@ - - + diff --git a/tools/cldr-code/src/test/resources/org/unicode/cldr/tool/KeyboardFlatten/broken-import-unknownbase.xml b/tools/cldr-code/src/test/resources/org/unicode/cldr/tool/KeyboardFlatten/broken-import-unknownbase.xml index 8c4e7d694c2..4982742d91d 100644 --- a/tools/cldr-code/src/test/resources/org/unicode/cldr/tool/KeyboardFlatten/broken-import-unknownbase.xml +++ b/tools/cldr-code/src/test/resources/org/unicode/cldr/tool/KeyboardFlatten/broken-import-unknownbase.xml @@ -1,9 +1,5 @@ - - + diff --git a/tools/cldr-code/src/test/resources/org/unicode/cldr/tool/KeyboardFlatten/broken-import-unknownver.xml b/tools/cldr-code/src/test/resources/org/unicode/cldr/tool/KeyboardFlatten/broken-import-unknownver.xml index 322f2ac605c..ddd4045ee92 100644 --- a/tools/cldr-code/src/test/resources/org/unicode/cldr/tool/KeyboardFlatten/broken-import-unknownver.xml +++ b/tools/cldr-code/src/test/resources/org/unicode/cldr/tool/KeyboardFlatten/broken-import-unknownver.xml @@ -1,9 +1,5 @@ - - + diff --git a/tools/cldr-code/src/test/resources/org/unicode/cldr/tool/KeyboardFlatten/broken-import-wrongparent.xml b/tools/cldr-code/src/test/resources/org/unicode/cldr/tool/KeyboardFlatten/broken-import-wrongparent.xml index d27ea5f70cd..71ba07e115a 100644 --- a/tools/cldr-code/src/test/resources/org/unicode/cldr/tool/KeyboardFlatten/broken-import-wrongparent.xml +++ b/tools/cldr-code/src/test/resources/org/unicode/cldr/tool/KeyboardFlatten/broken-import-wrongparent.xml @@ -1,9 +1,5 @@ - - + diff --git a/tools/pom.xml b/tools/pom.xml index 216cdd8c183..77a110f8bb4 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -254,6 +254,25 @@ true -Xmx6g -enableassertions + + false + 3.0 + false + true + true + true + + + false + UTF-8 + false + + + false + false + true + true +