diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateData.java b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateData.java
index 19f57e3c1..fa13ec9f0 100644
--- a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateData.java
+++ b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateData.java
@@ -33,784 +33,21 @@
public class GenerateData implements UCD_Types {
- /* static final boolean DEBUG = false;
-
- static final String HORIZONTAL_LINE = "# ================================================";
-
- static final void genSplit () {
- UnicodeSet split = new UnicodeSet();
- UnicodeSet reordrant = new UnicodeSet(
- "[\u093F\u09BF\u09c7\u09c8\u0abf\u0abf\u0b47\u0bc6\u0bc7\u0bc8"
- + "\u0d46\u0d47\u0d48\u0dd9\u0dda\u0ddb\u1031\u17be\u17c1\u17c2\u17c3]");
- UnicodeSet subjoined = new UnicodeSet();
- for (int i = 0; i <= 0x10FFFF; ++i) {
- if (!Default.ucd().isAssigned(i)) continue;
- Utility.dot(i);
- int cat = Default.ucd().getCategory(i);
- if (cat != Mc && cat != Mn && cat != Me) continue;
- if (Default.ucd().getName(i).indexOf("SUBJOINED") >= 0) {
- System.out.print('*');
- subjoined.add(i);
- continue;
- }
- String decomp = Default.nfd().normalize(i);
- //int count = countTypes(decomp, Mc);
- if (UTF16.countCodePoint(decomp) > 1) split.add(i);
- }
- Utility.fixDot();
- System.out.println("Split: " + split.size());
- Utility.showSetNames("", split, false, Default.ucd());
-
- System.out.println("Reordrant: " + reordrant.size());
- Utility.showSetNames("", reordrant, false, Default.ucd());
-
- System.out.println("Subjoined: " + subjoined.size());
- Utility.showSetNames("", subjoined, false, Default.ucd());
- }
-
- static int countTypes(String s, int filter) {
- int count = 0;
- int cp;
- for (int i = 0; i < s.length(); i+= UTF16.getCharCount(cp)) {
- cp = UTF16.charAt(s, i);
- int cat = Default.ucd().getCategory(i);
- if (cat == filter) count++;
- }
- return count;
- }
-
- //static UnifiedBinaryProperty ubp
-
- public static void checkHoffman(String test) {
- String result = Default.nfkc().normalize(test);
- System.out.println(Utility.hex(test) + " => " + Utility.hex(result));
- System.out.println();
- show(test, 0);
- System.out.println();
- show(result, 0);
- }
-
- public static void show(String s, int indent) {
- int cp;
- for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
- cp = UTF32.char32At(s, i);
- String cc = " " + Default.ucd().getCombiningClass(cp);
- cc = Utility.repeat(" ", 4 - cc.length()) + cc;
- System.out.println(Utility.repeat(" ", indent) + Default.ucd().getCode(cp) + cc + " " + Default.ucd().getName(cp));
- String decomp = Default.nfkc().normalize(cp);
- if (!decomp.equals(UTF32.valueOf32(cp))) {
- show(decomp, indent + 4);
- }
- }
- }
-
-
- static final int HEADER_EXTEND = 0, HEADER_DERIVED = 1, HEADER_SCRIPTS = 2;
-
- public static void doHeader(String fileName, PrintWriter output, int headerChoice) {
- output.println("# " + fileName);
- output.println(UnicodeDataFile.generateDateLine());
- output.println("#");
- if (headerChoice == HEADER_SCRIPTS) {
- } else if (headerChoice == HEADER_EXTEND) {
- output.println("# Unicode Character Database: Extended Properties");
- } else {
- output.println("# Unicode Character Database: Derived Property Data");
- output.println("# Generated algorithmically from the Unicode Character Database");
- }
- output.println("# For documentation, see UCD.html");
- //output.println("# Note: Unassigned and Noncharacter codepoints may be omitted");
- //output.println("# if they have default property values.");
- output.println(HORIZONTAL_LINE);
- output.println();
- }
-
- public static void checkDifferences (String targetVersion) throws IOException {
- System.out.println("Checking Differences");
- UCD target = UCD.make(targetVersion);
-
- PrintWriter log1 = Utility.openPrintWriter("Log1.xml", Utility.LATIN1_UNIX);
- log1.println("");
-
- PrintWriter log2 = Utility.openPrintWriter("Log2.xml", Utility.LATIN1_UNIX);
- log2.println("");
-
- for (int i = 0; i <= 0x10FFFF; ++i) {
- if (!target.isAllocated(i)) continue;
- Utility.dot(i);
- UData t = target.get(i, true);
- UData current = Default.ucd().get(i, true);
- if (i == 0x5E) {
- System.out.println(target.getDecompositionTypeID(i)
- + ", " + Utility.hex(target.getDecompositionMapping(i)));
- System.out.println(Default.ucd().getDecompositionTypeID(i)
- + ", " + Utility.hex(Default.ucd().getDecompositionMapping(i)));
- }
- if (t.equals(current)) continue;
-
- // print both for comparison
- log1.println(t.toString(target, UData.ABBREVIATED));
- log2.println(current.toString(Default.ucd(), UData.ABBREVIATED));
- }
- log1.println("");
- log2.println("");
- log1.close();
- log2.close();
- }
-
- public static void generateDerived (byte type, boolean checkTypeAndStandard, int headerChoice, String directory, String fileName) throws IOException {
-
-
- String newFile = directory + fileName + UnicodeDataFile.getFileSuffix(true);
- System.out.println("New File: " + newFile);
- PrintWriter output = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
- String[] batName = {""};
- org.unicode.cldr.util.Utility.generateBat(directory, fileName, UnicodeDataFile.getFileSuffix(true), batName);
-
- doHeader(fileName + UnicodeDataFile.getFileSuffix(false), output, headerChoice);
- for (int i = 0; i < DERIVED_PROPERTY_LIMIT; ++i) {
- UCDProperty up = DerivedProperty.make(i, Default.ucd());
- if (up == null) continue;
- boolean keepGoing = true;
- if (!up.isStandard()) keepGoing = false;
- if ((up.getType() & type) == 0) keepGoing = false;
-
- if (checkTypeAndStandard != keepGoing) continue;
- //if ((bitMask & (1L< 2;
- this.type = type;
- }
-
- public String optionalComment(int cp) { return ""; }
-
- public String valueName(int cp) {
- return UTF32.length32(ucdData.getDecompositionMapping(cp)) + "";
- }
-
- public byte status(int cp) {
- if (getType(cp) == type) return INCLUDE;
- return EXCLUDE;
- }
-
- public int getType(int cp) {
- if (!ucdData.isAssigned(cp)) return -1;
- if (ucdData.getDecompositionType(cp) != CANONICAL) return -1;
-
- if (oldUCD.getBinaryProperty(cp, CompositionExclusion)) return 1;
- if (cp == 0xFB1D) return 1; // special
-
- String decomp = ucdData.getDecompositionMapping(cp);
- int len = UTF32.length32(decomp);
- if (len == 1) return 3;
- int first = UTF32.char32At(decomp,0);
- if (ucdData.getCombiningClass(first) != 0) return 4;
-
- if (oldUCD.getDecompositionType(cp) == CANONICAL) return -1;
- if (ucdData.getDecompositionType(cp) == CANONICAL) return 2;
-
- return -1;
- }
- }
-
- public static void generatePropertyAliases() throws IOException {
-
- String prop = "";
- String propAbb = "";
- String value = "";
- String valueAbb = "";
-
- Map duplicates = new TreeMap();
- Set sorted = new TreeSet(java.text.Collator.getInstance());
- Set accumulation = new TreeSet(java.text.Collator.getInstance());
-
- for (int k = 0; k < UCD_Names.NON_ENUMERATED_NAMES.length; ++k) {
- propAbb = Utility.getUnskeleton(UCD_Names.NON_ENUMERATED_NAMES[k][0], false);
- prop = Utility.getUnskeleton(UCD_Names.NON_ENUMERATED_NAMES[k][1], true);
-
- byte type = STRING_PROP;
- if (propAbb.equals("nv")) {
- type = NUMERIC_PROP;
- } else if (propAbb.equals("age")) {
- type = CATALOG_PROP;
- } else if (propAbb.equals("blk")) {
- type = CATALOG_PROP;
- } else if (propAbb.equals("na")) {
- type = MISC_PROP;
- } else if (propAbb.equals("na1")) {
- type = MISC_PROP;
- } else if (propAbb.equals("isc")) {
- type = MISC_PROP;
- }
- addLine(sorted, UCD_Names.PROP_TYPE_NAMES[type][1], propAbb, prop, null);
- checkDuplicate(duplicates, accumulation, propAbb, prop);
- if (!prop.equals(propAbb)) checkDuplicate(duplicates, accumulation, prop, prop);
- }
- addLine(sorted, UCD_Names.PROP_TYPE_NAMES[MISC_PROP][1], "URS", "Unicode_Radical_Stroke", null);
- // TODO: merge above
-
- for (int k = 0; k < UCD_Names.SUPER_CATEGORIES.length; ++k) {
- valueAbb = Utility.getUnskeleton(UCD_Names.SUPER_CATEGORIES[k][0], false);
- value = Utility.getUnskeleton(UCD_Names.SUPER_CATEGORIES[k][1], true);
- String extra = Utility.getUnskeleton(UCD_Names.SUPER_CATEGORIES[k][1], true);
- addLine(sorted, "gc", valueAbb, value, extra, "# " + UCD_Names.SUPER_CATEGORIES[k][2]);
- checkDuplicate(duplicates, accumulation, value, "General_Category=" + value);
- if (!value.equals(valueAbb)) checkDuplicate(duplicates, accumulation, valueAbb, "General_Category=" + value);
- if (extra != null) checkDuplicate(duplicates, accumulation, extra, "General_Category=" + value);
- }
-
-
- addLine(sorted, "xx; T ; True");
- checkDuplicate(duplicates, accumulation, "T", "xx=True");
- addLine(sorted, "xx; F ; False");
- checkDuplicate(duplicates, accumulation, "F", "xx=False");
-
- addLine(sorted, "qc", UCD_Names.YN_TABLE[1], UCD_Names.YN_TABLE_LONG[1], null);
- checkDuplicate(duplicates, accumulation, UCD_Names.YN_TABLE[1], "qc=" + UCD_Names.YN_TABLE_LONG[1]);
- addLine(sorted, "qc", UCD_Names.YN_TABLE[0], UCD_Names.YN_TABLE_LONG[0], null);
- checkDuplicate(duplicates, accumulation, UCD_Names.YN_TABLE[0], "qc=" + UCD_Names.YN_TABLE_LONG[0]);
- addLine(sorted, "qc", "M", "Maybe", null);
- checkDuplicate(duplicates, accumulation, "M", "qc=Maybe");
-
- addLine(sorted, "blk", "n/a", Utility.getUnskeleton("no block", true), null);
-
- for (int i = 0; i < LIMIT_ENUM; ++i) {
- int type = i & 0xFF00;
- if (type == AGE) continue;
- if (i == (BINARY_PROPERTIES | CaseFoldTurkishI)) continue;
- if (i == (BINARY_PROPERTIES | Non_break)) continue;
- if (i == (BINARY_PROPERTIES | Case_Sensitive)) continue;
-
- if (type == NUMERIC_TYPE) {
- //System.out.println("debug");
- }
-
- UCDProperty up = UnifiedBinaryProperty.make(i, Default.ucd());
- if (up == null) continue;
- if (!up.isStandard()) continue;
-
- // System.out.println("At" + Utility.hex(i));
-
- // Save the Type Name, under BB for binary
-
- if (type == i || type == BINARY_PROPERTIES || type == DERIVED) {
- if (propAbb.equals("") || propAbb.equals(UCD_Names.YN_TABLE[1])) {
- System.out.println("WHOOPS: " + Utility.hex(i));
- }
- propAbb = Utility.getUnskeleton(up.getPropertyName(SHORT), false);
- prop = Utility.getUnskeleton(up.getPropertyName(LONG), true);
- addLine(sorted,
- type == SCRIPT
- ? UCD_Names.PROP_TYPE_NAMES[CATALOG_PROP][1]
- : type != DERIVED && type != BINARY_PROPERTIES
- ? UCD_Names.PROP_TYPE_NAMES[ENUMERATED_PROP][1]
- : UCD_Names.PROP_TYPE_NAMES[up.getValueType()][1],
- propAbb, prop, null);
- checkDuplicate(duplicates, accumulation, propAbb, prop);
- if (!prop.equals(propAbb)) checkDuplicate(duplicates, accumulation, prop, prop);
- }
-
- if (up.getValueType() < BINARY_PROP) continue;
- value = up.getValue(LONG);
- if (value.length() == 0) value = "none";
- else if (value.equals(UnicodeProperty.UNUSED)) continue;
-
- if (type != DECOMPOSITION_TYPE) {
- value = Utility.getUnskeleton(value, true);
- }
-
- //if (type == DERIVED) {
- //System.out.println("Derived " + up.getProperty());
- //}
-
-
- if (type == SCRIPT) {
- value = Default.ucd().getCase(value, FULL, TITLE);
- }
-
- valueAbb = up.getValue(SHORT);
- valueAbb = Utility.getUnskeleton(valueAbb, false);
- if (valueAbb.length() == 0) valueAbb = "n/a";
- //else if (valueAbb.equals(value)) valueAbb = "n/a";
-
-
- if (type == COMBINING_CLASS) {
- if (value.charAt(0) <= '9') { continue; }
- }
-
-
- if (type == JOINING_GROUP) {
- valueAbb = "n/a";
- }
-
-
- String elide = "";
- if (type == CATEGORY || type == SCRIPT || type == BINARY_PROPERTIES) elide = "\\p{"
- + valueAbb
- + "}";
- String abb = "";
- if (type != BINARY_PROPERTIES) abb = "\\p{"
- + UCD_Names.ABB_UNIFIED_PROPERTIES[i>>8]
- + "="
- + valueAbb
- + "}";
- String norm = "";
- if (type != BINARY_PROPERTIES) norm = "\\p{"
- + UCD_Names.SHORT_UNIFIED_PROPERTIES[i>>8]
- + "="
- + value
- + "}";
- System.out.println("" + elide + " | " + abb + " | " + norm + " |
");
-
-
-
- if (type == BINARY_PROPERTIES || type == DERIVED) {
- //if (value.equals(YN_TABLE_LONG[1])) continue;
- addLine(sorted, PROP_TYPE_NAMES[BINARY][1], valueAbb, value);
- checkDuplicate(duplicates, accumulation, value, value);
- if (!value.equalsIgnoreCase(valueAbb)) checkDuplicate(duplicates, accumulation, valueAbb, value);
- continue;
- }
-
-
- if (type == COMBINING_CLASS) {
- String num = up.getValue(NUMBER);
- num = "; " + Utility.repeat(" ", 3-num.length()) + num;
- addLine(sorted, propAbb + num, valueAbb, value, null);
- } else if (!valueAbb.equals(UCD_Names.YN_TABLE[1])) {
- addLine(sorted, propAbb, valueAbb, value, null);
- }
- checkDuplicate(duplicates, accumulation, value, prop + "=" + value);
- if (!value.equalsIgnoreCase(valueAbb) && !valueAbb.equals("n/a")) {
- checkDuplicate(duplicates, accumulation, valueAbb, prop + "=" + value);
- }
- }
-
- Iterator blockIterator = Default.ucd().getBlockNames().iterator();
- while (blockIterator.hasNext()) {
- addLine(sorted, "blk", "n/a", (String)blockIterator.next(), null);
- }
-
- UCD.BlockData blockData = new UCD.BlockData();
-
- int blockId = 0;
- while (Default.ucd().getBlockData(blockId++, blockData)) {
- addLine(sorted, "blk", "n/a", blockData.name);
- }
-
-
- String filename = "PropertyAliases";
- String newFile = "DerivedData/" + filename + UnicodeDataFile.getFileSuffix(true);
- PrintWriter log = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
- String[] batName = {""};
- String mostRecent = org.unicode.cldr.util.Utility.generateBat("DerivedData/", filename, UnicodeDataFile.getFileSuffix(true), batName);
-
- log.println("# " + filename + UnicodeDataFile.getFileSuffix(false));
- log.println(UnicodeDataFile.generateDateLine());
- log.println("#");
- Utility.appendFile("PropertyAliasesHeader.txt", Utility.LATIN1, log);
- log.println(HORIZONTAL_LINE);
- log.println();
- int count = Utility.print(log, sorted, "\n", new MyBreaker(true));
- log.println();
- log.println();
- log.println(HORIZONTAL_LINE);
- log.println("# Total: \t" + count);
- log.println();
- log.close();
- Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
-
- filename = "PropertyValueAliases";
- newFile = "DerivedData/" + filename + UnicodeDataFile.getFileSuffix(true);
- log = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
- mostRecent = org.unicode.cldr.util.Utility.generateBat("DerivedData/", filename, UnicodeDataFile.getFileSuffix(true), batName);
-
- log.println("# " + filename + UnicodeDataFile.getFileSuffix(false));
- log.println(UnicodeDataFile.generateDateLine());
- log.println("#");
- Utility.appendFile("PropertyValueAliasesHeader.txt", Utility.LATIN1, log);
- log.println(HORIZONTAL_LINE);
- log.println();
- Utility.print(log, sorted, "\n", new MyBreaker(false));
- log.println();
- log.close();
- Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
-
- filename = "PropertyAliasSummary";
- newFile = "OtherData/" + filename + UnicodeDataFile.getFileSuffix(true);
- log = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
- mostRecent = org.unicode.cldr.util.Utility.generateBat("OtherData/", filename, UnicodeDataFile.getFileSuffix(true), batName);
-
- log.println();
- log.println(HORIZONTAL_LINE);
- log.println();
- log.println("# Non-Unique names: the same name (under either an exact or loose match)");
- log.println("# occurs as a property name or property value name");
- log.println("# Note: no two property names can be the same,");
- log.println("# nor can two property value names for the same property be the same.");
- log.println();
- Utility.print(log, accumulation, "\n", new MyBreaker(false));
- log.println();
- log.close();
- Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
- }
-
- static void addLine(Set sorted, String f1, String f2, String f3, String f4) {
- addLine(sorted, f1, f2, f3, f4, null);
- }
-
- static void addLine(Set sorted, String f1, String f2, String f3, String f4, String comment) {
- //System.out.println("Adding: " + line);
- f1 += Utility.repeat(" ", 3 - f1.length());
- f1 += "; " + f2;
- f1 += Utility.repeat(" ", 15 - f1.length());
- f1 += "; " + f3;
- if (f4 != null) {
- f1 += Utility.repeat(" ", 30 - f1.length());
- f1 += f4;
- }
- if (comment != null) {
- f1 += Utility.repeat(" ", 50 - f1.length());
- f1 += comment;
- }
- sorted.add(f1);
- }
-
- static class MyBreaker implements Utility.Breaker {
- boolean status;
- int count;
-
- public MyBreaker(boolean status) {
- this.status = status;
- }
-
- public byte getType (String c) {
- for (byte i = 0; i <= BINARY_PROP; ++i) {
- if (c.startsWith(UCD_Names.PROP_TYPE_NAMES[i][1])) return i;
- }
- return UNKNOWN_PROP;
- }
-
- public boolean filter(Object current) {
- String c = current.toString();
- byte type = getType(c);
- if (type != UNKNOWN_PROP) return status;
- return !status;
- }
-
- public String get(Object current, Object old) {
- if (old == null) {
- old = " ";
- }
- String c = current.toString();
- String o = old.toString();
- String sep = "";
- if (!c.substring(0,2).equals(o.substring(0,2))) {
- sep = "\n";
- if (status) {
- byte type = getType(c);
- sep = sep + HORIZONTAL_LINE + sep + "# " + UCD_Names.PROP_TYPE_NAMES[type][0] + " Properties" + sep + HORIZONTAL_LINE + sep;
- }
- }
- if (status) {
- int pos = c.indexOf(';');
- c = c.substring(pos+1).trim();
- }
- return sep + c;
- }
- }
-
- static void checkDuplicate(Map m, Set accumulation, String toCheck, String originalComment) {
- toCheck = Utility.getSkeleton(toCheck);
- String comment = "{" + originalComment + "}";
-
- Set result = (Set) m.get(toCheck);
- if (result != null) {
- // Warn on serious problem: two property-names collide
- // or two property names & values collide.
- // examples:
- // if (1) "c" stood for both "General_Category" and "Combining_Class"
- // or if (2) "X=cc" stood for "X=control" and "X=compatibility"
- // 1: comment doesn't contain "=", and something in the results doesn't contain "="
- // 2: comment does contain "X=", and something else in results contains "X="
-
- int equalPos = comment.indexOf('=');
- if (equalPos < 0) { // #1
- String conflict = Utility.findSubstring("=", result, false);
- if (conflict != null) {
- System.out.println("Property Name Conflict " + toCheck);
- System.out.println(" With " + comment);
- System.out.println(" And " + conflict);
- }
- } else { // #2
- String trial = comment.substring(0,equalPos+1);
- String conflict = Utility.findSubstring(trial, result, true);
- if (conflict != null) {
- System.out.println("Property Value Name Conflict " + toCheck);
- System.out.println(" With " + comment);
- System.out.println(" And " + conflict);
- }
- }
-
- // accumulate differences
-
- String acc = (String)accumulation.get(toCheck);
- if (acc == null) {
- acc = "# \"" + toCheck + "\":\t" + originalComment;
- }
- acc += ";\t" + result;
-
- result.add(comment);
- accumulation.add("# " + result.toString() + ":\t" + toCheck);
- } else {
- result = new TreeSet();
- result.add(comment);
- m.put(toCheck, result);
- }
- }
-
- public static void generateVerticalSlice(int startEnum, int endEnum,
- int headerChoice, String directory, String file) throws IOException {
-
-
- String newFile = directory + file + UnicodeDataFile.getFileSuffix(true);
- PrintWriter output = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
- String[] batName = {""};
- String mostRecent = org.unicode.cldr.util.Utility.generateBat(directory, file, UnicodeDataFile.getFileSuffix(true), batName);
-
- doHeader(file + UnicodeDataFile.getFileSuffix(false), output, headerChoice);
- int last = -1;
- for (int i = startEnum; i < endEnum; ++i) {
- UCDProperty up = UnifiedBinaryProperty.make(i, Default.ucd());
- if (up == null) continue;
- if (up.skipInDerivedListing()) continue;
-
-
- if (i == DECOMPOSITION_TYPE || i == NUMERIC_TYPE
- || i == (BINARY_PROPERTIES | Non_break)
- || i == (BINARY_PROPERTIES | CaseFoldTurkishI)
- || i == (HANGUL_SYLLABLE_TYPE | NA)
- || i == (JOINING_TYPE | JT_U)
- || i == (JOINING_GROUP | NO_SHAPING)
- ) continue; // skip zero case
-
- if (skipSpecial == SKIP_SPECIAL
- && i >= (BINARY_PROPERTIES | CompositionExclusion)
- && i < (AGE + NEXT_ENUM)) continue;
-
- if ((last & 0xFF00) != (i & 0xFF00) && (i <= BINARY_PROPERTIES || i >= SCRIPT)) {
- output.println();
- output.println(HORIZONTAL_LINE);
- output.println("# " + up.getHeader());
- output.println(HORIZONTAL_LINE);
- output.println();
- System.out.println();
- System.out.println(up.getHeader());
- last = i;
- } else {
- output.println(HORIZONTAL_LINE);
- output.println();
- }
- System.out.print(".");
- if (DEBUG) System.out.println(i);
- new MyPropertyLister(Default.ucd(), i, output).print();
- output.flush();
- }
- if (endEnum == LIMIT_ENUM) {
- output.println();
- output.println(HORIZONTAL_LINE);
- output.println("# Numeric Values (from UnicodeData.txt, field 6/7/8)");
- output.println(HORIZONTAL_LINE);
- output.println();
- System.out.println();
- System.out.println("@NUMERIC VALUES");
-
- Set numericValueSet = new TreeSet();
- for (int i = 0; i < 0x10FFFF; ++i) {
- double nv = Default.ucd().getNumericValue(i);
- if (Double.isNaN(nv)) continue;
- numericValueSet.add(new Double(nv));
- }
- Iterator it = numericValueSet.iterator();
- while(it.hasNext()) {
- new MyFloatLister(Default.ucd(), ((Double)it.next()).doubleValue(), output).print();
- output.println();
- System.out.print(".");
- }
- output.flush();
- }
- output.close();
- //System.out.println("HERE");
- Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
- System.out.println();
- }
-
- */
public static void writeNormalizerTestSuite(String directory, String fileName)
throws IOException {
+ final var nfd = Default.nfd();
+ final var nfc = Default.nfc();
+ final var nfkd = Default.nfkd();
final UnicodeDataFile fc =
UnicodeDataFile.openAndWriteHeader(directory, fileName)
.setSkipCopyright(Settings.SKIP_COPYRIGHT);
final PrintWriter log = fc.out;
- // final String suffix = FileInfix.getDefault().getFileSuffix(".txt");
- // final String newFile = directory + fileName + suffix;
- // PrintWriter log = Utility.openPrintWriter(newFile, Utility.UTF8_UNIX);
- // String[] batName = {""};
- // String mostRecent = org.unicode.cldr.util.Utility.generateBat(directory, fileName,
- // UnicodeDataFile.getFileSuffix(true), batName);
-
final String[] example = new String[256];
- // log.println("# " + fileName + UnicodeDataFile.getFileSuffix(false));
- // log.println(UnicodeDataFile.generateDateLine());
- // log.println("#");
- // log.println("# Normalization Test Suite");
- // log.println("# Format:");
- // log.println("#");
- // log.println("# Columns (c1, c2,...) are separated by semicolons");
- // log.println("# Comments are indicated with hash marks");
- // log.println("#");
- // log.println("# CONFORMANCE:");
- // log.println("# 1. The following invariants must be true for all conformant
- // implementations");
- // log.println("#");
- // log.println("# NFC");
- // log.println("# c2 == NFC(c1) == NFC(c2) == NFC(c3)");
- // log.println("# c4 == NFC(c4) == NFC(c5)");
- // log.println("#");
- // log.println("# NFD");
- // log.println("# c3 == NFD(c1) == NFD(c2) == NFD(c3)");
- // log.println("# c5 == NFD(c4) == NFD(c5)");
- // log.println("#");
- // log.println("# NFKC");
- // log.println("# c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) ==
- // NFKC(c5)");
- // log.println("#");
- // log.println("# NFKD");
- // log.println("# c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) ==
- // NFKD(c5)");
- // log.println("#");
- // log.println("# 2. For every code point X assigned in this version of Unicode that
- // is not specifically");
- // log.println("# listed in Part 1, the following invariants must be true for all
- // conformant");
- // log.println("# implementations:");
- // log.println("#");
- // log.println("# X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X)");
-
System.out.println("Writing Part 1");
- // log.println("#");
- // log.println("@Part0 # Specific cases");
- // log.println("#");
-
for (final String testSuiteCase : testSuiteCases) {
writeLine(testSuiteCase, log, false);
}
@@ -829,8 +66,7 @@ public static void writeNormalizerTestSuite(String directory, String fileName)
final org.unicode.props.UnicodeProperty sc =
IndexUnicodeProperties.make().getProperty(UcdProperty.Script);
for (final String cp : UnicodeSet.ALL_CODE_POINTS) {
- final String[] decompositions =
- new String[] {Default.nfd().normalize(cp), Default.nfkd().normalize(cp)};
+ final String[] decompositions = new String[] {nfd.normalize(cp), nfkd.normalize(cp)};
for (final String decomposition : decompositions) {
final int lastCCC =
Default.ucd()
@@ -971,7 +207,6 @@ public static void writeNormalizerTestSuite(String directory, String fileName)
if (Default.ucd().getDecompositionType(ch) != CANONICAL) {
continue;
}
- // if (!Default.nfc().isNormalized(ch)) continue;
final String s = Default.ucd().getDecompositionMapping(ch);
if (UTF16.hasMoreCodePointsThan(s, 2)) {
continue;
@@ -1032,7 +267,7 @@ public static void writeNormalizerTestSuite(String directory, String fileName)
for (var entry : canonicalDecompositionsByCodepoint.entrySet()) {
final int cp = entry.getKey();
final String decomposition = entry.getValue();
- if (Default.nfc().normalize(cp).equals(Character.toString(cp))) {
+ if (nfc.normalize(cp).equals(Character.toString(cp))) {
int first = decomposition.codePointAt(0);
int last = decomposition.codePointBefore(decomposition.length());
primaryCompositesByFirstNFDCodePointBuilder
@@ -1107,8 +342,8 @@ public static void writeNormalizerTestSuite(String directory, String fileName)
}
for (int firstCandidate : firstCandidates) {
for (int secondCandidate : secondCandidates) {
- String firstDecomposition = Default.nfd().normalize(firstCandidate);
- String secondDecomposition = Default.nfd().normalize(secondCandidate);
+ String firstDecomposition = nfd.normalize(firstCandidate);
+ String secondDecomposition = nfd.normalize(secondCandidate);
String decomposition = firstDecomposition + secondDecomposition;
if (canonicalDecompositionsOfSingleCodepoints.contains(decomposition)) {
// Already covered in parts 1 (single code points) and 4
@@ -1123,16 +358,13 @@ public static void writeNormalizerTestSuite(String directory, String fileName)
// firstCandidate and secondCandidate, look for strings that
// cannot be split between those two characters.
// Those are our test cases for Part 5.
- String nfc = Default.nfc().normalize(decomposition);
+ String normalizedFormC = nfc.normalize(decomposition);
forAllStringsCanonicallyDecomposingTo(
decomposition,
s -> {
for (int j = 0; j < s.length(); ++j) {
- if (Default.nfd()
- .normalize(s.substring(0, j))
- .equals(firstDecomposition)
- && Default.nfd()
- .normalize(s.substring(j))
+ if (nfd.normalize(s.substring(0, j)).equals(firstDecomposition)
+ && nfd.normalize(s.substring(j))
.equals(secondDecomposition)) {
// The string splits into parts
// equivalent to firstCandidate and
@@ -1141,7 +373,7 @@ public static void writeNormalizerTestSuite(String directory, String fileName)
return;
}
}
- if (s.equals(nfc)) {
+ if (s.equals(normalizedFormC)) {
// If the NFC of
// firstCandidate + secondCandidate has a
// link, thus
@@ -1162,7 +394,7 @@ public static void writeNormalizerTestSuite(String directory, String fileName)
// correct, not tried it.” (Knuth 1977), let
// us check those statements.
String linkDecomposition =
- Default.nfd().normalize(nfc.codePointAt(0));
+ nfd.normalize(normalizedFormC.codePointAt(0));
if (!linkDecomposition.startsWith(firstDecomposition)) {
throw new AssertionError(
"The first code point of NFC("
@@ -1171,10 +403,10 @@ public static void writeNormalizerTestSuite(String directory, String fileName)
+ Default.ucd().getName(secondDecomposition)
+ ") does not cover the first part");
}
- skippedNFCs.add(nfc);
+ skippedNFCs.add(normalizedFormC);
return;
}
- part5NFCs.add(nfc);
+ part5NFCs.add(normalizedFormC);
writeLine(s, log, true);
System.out.println(Default.ucd().getName(s));
});
@@ -1182,11 +414,11 @@ public static void writeNormalizerTestSuite(String directory, String fileName)
}
}
- for (String nfc : skippedNFCs) {
- if (!part5NFCs.contains(nfc)) {
+ for (String normalizedFormC : skippedNFCs) {
+ if (!part5NFCs.contains(normalizedFormC)) {
throw new AssertionError(
"Candidate Part 5 test case "
- + Default.ucd().getName(nfc)
+ + Default.ucd().getName(normalizedFormC)
+ " was suppressed but did not appear as the NFC of another test"
+ " case in Part 5.");
}
@@ -1197,7 +429,6 @@ public static void writeNormalizerTestSuite(String directory, String fileName)
log.println("#");
log.println("# EOF");
fc.close();
- // Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
}
private static final ImmutableMap canonicalDecompositionsByCodepoint;
@@ -1261,27 +492,6 @@ static void forAllStrings(
}
}
- /*
-
- static void handleIdentical() throws IOException {
- DirectoryIterator target = new DirectoryIterator(GEN_DIR + File.separator + "DerivedData");
- DirectoryIterator.RootFileFilter filter = new DirectoryIterator.RootFileFilter("");
- DirectoryIterator recent = new DirectoryIterator(UCD_DIR, filter);
- while (true) {
- File targetFile = target.next();
- if (targetFile == null) break;
- recent.reset();
- filter.setRoot(DirectoryIterator.getRoot(targetFile));
- File lastFile = recent.next();
- if (lastFile == null) break;
- System.out.println("Target: " + targetFile);
- System.out.println("Last: " + lastFile);
- if (!DirectoryIterator.isAlmostIdentical(targetFile, lastFile, true)) continue;
- System.out.println("Almost Identical");
- }
- }
-
- */
static void writeLine(String cc, PrintWriter log, boolean check) {
final String c = Default.nfc().normalize(cc);
String d = Default.nfd().normalize(cc);
@@ -1370,349 +580,4 @@ static final String comma(String s) {
"\u1100\uAC00\u11A8",
"\u1100\uAC00\u11A8\u11A8",
};
- /*
- static final void backwardsCompat(String directory, String filename, int[] list) throws IOException {
-
-
- String newFile = directory + filename + UnicodeDataFile.getFileSuffix(true);
- PrintWriter log = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
- String[] batName = {""};
- String mostRecent = org.unicode.cldr.util.Utility.generateBat(directory, filename, UnicodeDataFile.getFileSuffix(true), batName);
- DiffPropertyLister dpl;
- UnicodeSet cummulative = new UnicodeSet();
-
- try {
- for (int i = 0; i < list.length; ++i) {
- int prop = list[i];
- log.println();
- log.println(HORIZONTAL_LINE);
- log.println("###### " + DerivedProperty.make(prop, Default.ucd()).getName());
- //log.println();
- //log.println(HORIZONTAL_LINE);
- //new DiffPropertyLister("3.2.0", "1.1.0", log, prop).print();
- log.println();
- log.println(HORIZONTAL_LINE);
-
- log.println();
- dpl = new DiffPropertyLister("3.2.0", "2.0.0", log, prop);
- dpl.print();
- cummulative.addAll(dpl.getSet());
- log.println(HORIZONTAL_LINE);
-
- log.println();
- dpl = new DiffPropertyLister("3.2.0", "2.1.2", log, prop);
- dpl.print();
- cummulative.addAll(dpl.getSet());
- log.println(HORIZONTAL_LINE);
-
- log.println();
- dpl = new DiffPropertyLister("3.2.0", "2.1.5", log, prop);
- dpl.print();
- cummulative.addAll(dpl.getSet());
- log.println(HORIZONTAL_LINE);
-
- log.println();
- dpl = new DiffPropertyLister("3.2.0", "2.1.8", log, prop);
- dpl.print();
- cummulative.addAll(dpl.getSet());
- log.println(HORIZONTAL_LINE);
-
- log.println();
- dpl = new DiffPropertyLister("3.2.0", "3.0.0", log, prop);
- dpl.print();
- cummulative.addAll(dpl.getSet());
- log.println(HORIZONTAL_LINE);
-
- log.println();
- dpl = new DiffPropertyLister("3.2.0", "3.0.1", log, prop);
- dpl.print();
- cummulative.addAll(dpl.getSet());
- log.println(HORIZONTAL_LINE);
-
- log.println();
- dpl = new DiffPropertyLister("3.2.0", "3.1.0", log, prop);
- dpl.print();
- cummulative.addAll(dpl.getSet());
- log.println(HORIZONTAL_LINE);
-
- log.println();
- dpl = new DiffPropertyLister("3.2.0", "3.1.1", log, prop);
- dpl.print();
- cummulative.addAll(dpl.getSet());
- log.println(HORIZONTAL_LINE);
-
- log.println();
- log.println("Cummulative differences");
- UCDProperty up = DerivedProperty.make(prop, Default.ucd());
- UnicodeSet newProp = up.getSet();
- Utility.showSetNames(log, "", cummulative.removeAll(newProp), false, false, Default.ucd());
- }
- } finally {
- if (log != null) {
- log.close();
- Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
- }
- }
- }
-
- static final void generateAge(String directory, String filename) throws IOException {
-
- String newFile = directory + filename + UnicodeDataFile.getFileSuffix(true);
- PrintWriter log = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
- String[] batName = {""};
- String mostRecent = org.unicode.cldr.util.Utility.generateBat(directory, filename, UnicodeDataFile.getFileSuffix(true), batName);
- try {
- log.println("# " + filename + UnicodeDataFile.getFileSuffix(false));
- log.println(UnicodeDataFile.generateDateLine());
- log.println("#");
- log.println("# Unicode Character Database: Derived Property Data");
- log.println("# This file shows when various code points were designated in Unicode");
- log.println("# Notes:");
- log.println("# - The term 'designated' means that a previously reserved code point was specified");
- log.println("# to be a noncharacter or surrogate, or assigned as a character,");
- log.println("# control or format code.");
- log.println("# - Versions are only tracked from 1.1 onwards, since version 1.0");
- log.println("# predated changes required by the ISO 10646 merger.");
- log.println("# - The Hangul Syllables that were removed from 2.0 are not included in the 1.1 listing.");
- log.println("# - The supplementary private use code points and the non-character code points");
- log.println("# were designated in version 2.0, but not specifically listed in the UCD");
- log.println("# until versions 3.0 and 3.1 respectively.");
- log.println("#");
- log.println("# For details on the contents of each version, see");
- log.println("# http://www.unicode.org/versions/enumeratedversions.html.");
-
- // http://www.unicode.org/versions/enumeratedversions.html
-
- log.println(HORIZONTAL_LINE);
- log.println();
- new DiffPropertyLister(null, "1.1.0", log).print();
- log.println(HORIZONTAL_LINE);
- log.println();
- new DiffPropertyLister("1.1.0", "2.0.0", log).print();
- log.println(HORIZONTAL_LINE);
- log.println();
- new DiffPropertyLister("2.0.0", "2.1.2", log).print();
- log.println(HORIZONTAL_LINE);
- log.println();
- new DiffPropertyLister("2.1.2", "3.0.0", log).print();
- log.println(HORIZONTAL_LINE);
- log.println();
- new DiffPropertyLister("3.0.0", "3.1.0", log).print();
- log.println(HORIZONTAL_LINE);
- log.println();
- new DiffPropertyLister("3.1.0", "3.2.0", log).print();
- log.println(HORIZONTAL_LINE);
- log.println();
- new DiffPropertyLister("3.2.0", "4.0.0", log).print();
-
- printDiff("110", "200");
- UnicodeSet u11 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-1.1.txt", false);
- UnicodeSet u20 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-2.0.txt", false);
- UnicodeSet u21 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-2.1.txt", false);
- UnicodeSet u30 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-3.0.txt", false);
- UnicodeSet u31 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-3.1.txt", false);
-
- log.println();
- log.println("# Code points assigned in Unicode 1.1 (minus Hangul Syllables): "
- + n.format(u11.count()));
- log.println();
- u11.print(log, false, false, "1.1");
-
- UnicodeSet u20m = new UnicodeSet(u20).remove(u11);
- log.println();
- log.println("# Code points assigned in Unicode 2.0 (minus Unicode 1.1): "
- + n.format(u20m.count()));
- log.println();
- u20m.print(log, false, false, "2.0");
-
- UnicodeSet u21m = new UnicodeSet(u21).remove(u20);
- log.println();
- log.println("# Code points assigned in Unicode 2.1 (minus Unicode 2.0): "
- + n.format(u21m.count()));
- log.println();
- u21m.print(log, false, false, "2.1");
-
- UnicodeSet u30m = new UnicodeSet(u30).remove(u21);
- log.println();
- log.println("# Code points assigned in Unicode 3.0 (minus Unicode 2.1): "
- + n.format(u30m.count()));
- log.println();
- u30m.print(log, false, false, "3.0");
-
- UnicodeSet u31m = new UnicodeSet(u31).remove(u30);
- log.println();
- log.println("# Code points assigned in Unicode 3.1 (minus Unicode 3.0): "
- + n.format(u31m.count()));
- log.println();
- u31m.print(log, false, false, "3.1");
-
- } finally {
- if (log != null) {
- log.close();
- Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
- }
- }
-
- }
-
- public static void listCombiningAccents() throws IOException {
-
- PrintWriter log = Utility.openPrintWriter("ListAccents" + UnicodeDataFile.getFileSuffix(true), Utility.LATIN1_UNIX);
- Set set = new TreeSet();
- Set set2 = new TreeSet();
-
- for (int i = 0; i < 0x10FFFF; ++i) {
- Utility.dot(i);
- if (!Default.ucd().isRepresented(i)) continue;
-
- if (Default.nfd().isNormalized(i)) {
- if (Default.ucd().getScript(i) == LATIN_SCRIPT) {
- int cp = i;
- String hex = "u" + Utility.hex(cp, 4);
- set.add("# yyy $x <> \\" + hex + " ; # " + Default.ucd().getName(cp));
- }
- continue;
- }
-
- String decomp = Default.nfd().normalize(i);
- int j;
- for (j = 0; j < decomp.length(); j += UTF16.getCharCount(i)) {
- int cp = UTF16.charAt(decomp, j);
- byte cat = Default.ucd().getCategory(cp);
- if (cat != Mn) continue;
- String hex = "u" + Utility.hex(cp, 4);
- set.add("# xxx $x <> \\" + hex + " ; # " + Default.ucd().getName(cp));
- }
- }
-
- Iterator it = set.iterator();
- while (it.hasNext()) {
- log.println(it.next());
- }
- log.close();
- }
-
- public static void listGreekVowels() throws IOException {
-
- PrintWriter log = Utility.openPrintWriter("ListGreekVowels" + UnicodeDataFile.getFileSuffix(true), Utility.LATIN1_UNIX);
- Set set = new TreeSet();
- Set set2 = new TreeSet();
-
- String vowels = "\u03B1\u03B5\u03B7\u03B9\u03BF\u03C5\u03C9\u0391\u0395\u0397\u0399\u039F\u03A5\u03A9";
- String diphthongEnd = "\u03B9\u03C5\u0399\u03A5";
- String diphthongStart = "\u03B1\u03B5\u03B7\u03BF\u03C5\u0391\u0395\u0397\u039F\u03A5";
- String etas = "\u03B7\u0397";
- String iotas = "\u03B9\u0399";
-
- for (char i = 0; i < 0xFFFF; ++i) {
- Utility.dot(i);
- if (!Default.ucd().isRepresented(i)) continue;
- if (Default.ucd().getScript(i) != GREEK_SCRIPT) continue;
- String decomp = Default.nfd().normalize(i);
-
- if (decomp.indexOf('\u0306') >= 0) continue; // skip breve
- if (decomp.indexOf('\u0304') >= 0) continue; // skip macron
-
- String comp = Default.nfc().normalize(decomp);
- if (!comp.equals(String.valueOf(i))) continue; // skip compats
-
- char first = decomp.charAt(0);
-
- if (vowels.indexOf(first) < 0) continue;
-
- String h = "";
- if (decomp.indexOf('\u0314') >= 0) h = "\uFFFF";
-
- if (diphthongEnd.indexOf(first) >= 0) {
- for (int j = 0; j < diphthongStart.length(); ++j) {
- String v = diphthongStart.substring(j, j+1);
- char vc = v.charAt(0);
- if (Default.ucd().getCategory(vc) == Ll && Default.ucd().getCategory(first) == Lu) continue;
- if (etas.indexOf(vc) >= 0 && iotas.indexOf(first) >= 0) continue;
- set.add(new Pair(h + v + first, new Pair(v + decomp, v + i)));
- }
- }
- set.add(new Pair(h+first, new Pair(decomp, String.valueOf(i))));
- }
-
- Iterator it = set.iterator();
- Object last = "";
- while (it.hasNext()) {
- Pair p = (Pair) it.next();
- if (!last.equals(p.first)) {
- log.println();
- last = p.first;
- } else {
- log.print(", ");
- }
- p = (Pair) p.second;
- log.print(p.second);
- }
- log.close();
- }
-
- public static void listKatakana() throws IOException {
-
-
- for (char i = 'a'; i <= 'z'; ++i) {
- doKana(String.valueOf(i));
- if (i == 'c') doKana("ch");
- if (i == 's') doKana("sh");
- if (i == 'd') {
- doKana("dz");
- doKana("dj");
- }
- }
-
- System.out.println();
- }
-
- public static void doKana(String i) {
-
- String vowels = "aeiou";
- System.out.println();
- System.out.print(i + " " + i + i);
- System.out.println();
- for (int j = 0; j < vowels.length(); ++j) {
- char c = vowels.charAt(j);
- System.out.print(" " + i + c);
- }
-
- System.out.println();
- for (int j = 0; j < vowels.length(); ++j) {
- char c = vowels.charAt(j);
- System.out.print(" " + i + "y" + c);
- }
- }
-
- public static void genTrailingZeros() {
-
- UnicodeSet result = new UnicodeSet();
- for (int i = 0; i < 0x10FFFF; ++i) {
- if ((i & 0xFFF) == 0) System.out.println("# " + i);
- if (!Default.ucd().isAssigned(i)) continue;
- if (Default.nfd().isNormalized(i)) continue;
- String decomp = Default.nfd().normalize(i);
- int cp;
- for (int j = 0; j < decomp.length(); j += UTF16.getCharCount(cp)) {
- cp = UTF16.charAt(decomp,j);
- if (j == 0) continue; // skip first
- if (Default.ucd().getCombiningClass(cp) == 0) {
- result.add(cp);
- }
- }
- }
- int rangeCount = result.getRangeCount();
- for (int k = 0; k < rangeCount; ++k) {
- int start = result.getRangeStart(k);
- int end = result.getRangeEnd(k);
- System.out.println(
- Utility.hex(start)
- + (start != end ? ".." + Utility.hex(end) : "")
- + "; "
- + Default.ucd().getName(start)
- + (start != end ? ".." + Default.ucd().getName(end) : ""));
- }
- System.out.println("TrailingZero count: " + result.size());
- }*/
}