From d9b2073d932052cdcb836c317c0b91d1e0858d83 Mon Sep 17 00:00:00 2001 From: macchiati Date: Tue, 30 Jul 2024 20:54:44 -0700 Subject: [PATCH 1/3] CLDR-17844 Modify the date report --- .../unicode/cldr/util/CodePointEscaper.java | 22 ++- .../unicode/cldr/util/DateTimeFormats.java | 140 ++++++++++++++++-- 2 files changed, 145 insertions(+), 17 deletions(-) diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/CodePointEscaper.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/CodePointEscaper.java index 711d16895fa..9b6c304024a 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/CodePointEscaper.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/CodePointEscaper.java @@ -2,6 +2,7 @@ import com.ibm.icu.impl.UnicodeMap; import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeSet; import java.util.Locale; @@ -19,10 +20,13 @@ public enum CodePointEscaper { LF(0xA, "line feed"), CR(0xD, "carriage return"), SP(0x20, "space", "ASCII space"), - NSP(0x2009, "narrow/thin space", "Also known as ‘thin space’"), + TSP(0x2009, "thin space", "Aka ‘narrow space’"), NBSP(0xA0, "no-break space", "Same as space, but doesn’t line wrap."), - NNBSP(0x202F, "narrow/thin no-break space", "Same as narrow space, but doesn’t line wrap."), + NBTSP( + 0x202F, + "no-break thin space", + "Same as thin space, but doesn’t line wrap. Aka 'narrow no-break space'"), WNJ( 0x200B, @@ -147,6 +151,11 @@ public int getCodePoint() { return codePoint; } + /** Return the string form of the code point for this character. 
*/ + public String getString() { + return UTF16.valueOf(codePoint); + } + /** Returns the escaped form from the code point for this enum */ public String codePointToEscaped() { return ESCAPE_START + rawCodePointToEscaped(codePoint) + ESCAPE_END; @@ -196,6 +205,15 @@ public static String toEscaped(String unescaped, UnicodeSet toEscape) { }); return result.toString(); } + + public static String getEscaped(int cp, UnicodeSet toEscape) { + if (!toEscape.contains(cp)) { + return UTF16.valueOf(cp); + } else { + return codePointToEscaped(cp); + } + } + /** Return unescaped string */ public static String toUnescaped(String escaped) { if (escaped == null) { diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/DateTimeFormats.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/DateTimeFormats.java index e14b0044046..a7f299a967e 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/DateTimeFormats.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/DateTimeFormats.java @@ -1,7 +1,9 @@ package org.unicode.cldr.util; import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; import com.ibm.icu.impl.Row.R3; +import com.ibm.icu.text.Bidi; import com.ibm.icu.text.DateFormat; import com.ibm.icu.text.DateIntervalFormat; import com.ibm.icu.text.DateIntervalInfo; @@ -13,6 +15,7 @@ import com.ibm.icu.text.MessageFormat; import com.ibm.icu.text.SimpleDateFormat; import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSet.SpanCondition; import com.ibm.icu.util.Calendar; import com.ibm.icu.util.DateInterval; import com.ibm.icu.util.ICUUncheckedIOException; @@ -45,6 +48,15 @@ import org.unicode.cldr.util.SupplementalDataInfo.PluralInfo.Count; public class DateTimeFormats { + private static final UnicodeSet TO_ESCAPE = + new UnicodeSet(CodePointEscaper.FORCE_ESCAPE) + .remove(CodePointEscaper.SP.getCodePoint()) + .remove(CodePointEscaper.TSP.getCodePoint()) + 
.remove(CodePointEscaper.NBSP.getCodePoint()) + .remove(CodePointEscaper.NBTSP.getCodePoint()) + .freeze(); + private static final String MISSING_PART = "ⓜⓘⓢⓢⓘⓝⓖ"; + private static final CLDRConfig CONFIG = CLDRConfig.getInstance(); private static final Date SAMPLE_DATE_DEFAULT_END = new Date(2099 - 1900, 0, 13, 14, 45, 59); private static final String DIR = CLDRPaths.CHART_DIRECTORY + "/verify/dates/"; private static SupplementalDataInfo sdi = SupplementalDataInfo.getInstance(); @@ -76,9 +88,19 @@ enum MyOptions { // constant sets should // probably be moved to a common file of such things. private static final UnicodeSet BIDI_MARKS = new UnicodeSet("[:Bidi_Control:]").freeze(); - private static final String exampleSep = "
"; + + private static final String ltrBackground = "background-color:#EEE;"; + private static final String tableBackground = " background-color:#DDF;"; + private static final String rtlStart = "
"; - private static final String rtlEnd = "
"; + private static final String ltrStart = "
"; + private static final String divEnd = "
"; + private static final String tableStyle = + "style='border-collapse: collapse;" + tableBackground + " margin: auto'"; + + private static final String ltrSpan = ""; + private static final String tableSpan = ""; + private static final String spanEnd = ""; private static final String[] STOCK = {"short", "medium", "long", "full"}; private static final String[] CALENDAR_FIELD_TO_PATTERN_LETTER = { @@ -125,7 +147,7 @@ enum MyOptions { private ULocale locale; private ICUServiceBuilder icuServiceBuilder; private ICUServiceBuilder icuServiceBuilderEnglish = - new ICUServiceBuilder().setCldrFile(CLDRConfig.getInstance().getEnglish()); + new ICUServiceBuilder().setCldrFile(CONFIG.getEnglish()); private DateIntervalInfo dateIntervalInfo = new DateIntervalInfo(); private String calendarID; @@ -133,8 +155,7 @@ enum MyOptions { private boolean isRTL; private static String surveyUrl = - CLDRConfig.getInstance() - .getProperty("CLDR_SURVEY_URL", "http://st.unicode.org/cldr-apps/survey"); + CONFIG.getProperty("CLDR_SURVEY_URL", "http://st.unicode.org/cldr-apps/survey"); /** * Set a CLDRFile and calendar. Must be done before calling addTable. @@ -493,7 +514,33 @@ public boolean isPresent(String skeleton) { public void addTable(DateTimeFormats comparison, Appendable output) { try { output.append( - "

" + hackDoubleLinked("Patterns") + "

\n"); + "

" + + hackDoubleLinked("Patterns") + + "

" + + "

Normally, there is one line containing an example in each Native Example cell. " + + (!isRTL + ? "" + : "However, two examples are provided if the locale is right-to-left, like Arabic or Hebrew, " + + "and the paragraph direction can cause a different display. " + + "The first has a RTL paragraph direction, " + + "while the second has a LTR paragraph direction " + + ltrSpan + + "and a different background" + + spanEnd + + ". If the display of either example causes strings of letters or numbers to collide, " + + "then a ⚠️ is shown. ") + + "When an example has hidden characters, then " + + tableSpan + + "an extra line" + + spanEnd + + " shows those characters " + + "such as ❰RLM❱ for the invisible Right-to-Left Mark. " + + "So that the ordering of the characters in memory is clear, they are presented left-to-right one at a time. " + + "so that the placement is clear. " + + "When a pattern (or a component of a pattern) is missing, it is displayed as " + + MISSING_PART + + ".

" + + "\n
"); Diff diff = new Diff(); boolean is24h = generator.getDefaultHourFormatChar() == 'H'; showRow( @@ -502,7 +549,7 @@ public void addTable(DateTimeFormats comparison, Appendable output) { FIELDS_TITLE, "Skeleton", "English Example", - "Native Example (neutral context,
then RTL if relevant)", + "Native Example", false); for (String[] nameAndSkeleton : NAME_AND_PATTERN) { String name = nameAndSkeleton[0]; @@ -615,12 +662,76 @@ private String getExample(String skeleton) { } } String transformedExample = TransliteratorUtilities.toHTML.transform(example); - if (isRTL || BIDI_MARKS.containsSome(transformedExample)) { - transformedExample += exampleSep + rtlStart + transformedExample + rtlEnd; + if ((isRTL || BIDI_MARKS.containsSome(example)) && !example.contains(MISSING_PART)) { + Bidi bidiLTR = new Bidi(example, Bidi.DIRECTION_LEFT_TO_RIGHT); + String orderedLTR = bidiLTR.writeReordered(0); + Bidi bidiRTL = new Bidi(example, Bidi.DIRECTION_RIGHT_TO_LEFT); + String orderedRTL = bidiRTL.writeReordered(0); + if (!orderedLTR.equals(orderedRTL)) { + // since this is RTL, we put it first + String rtlVersion = rtlStart + transformedExample + divEnd; + String ltrVersion = ltrStart + transformedExample + divEnd; // colored + Set fieldsLTR = getFields(orderedLTR); + Set fieldsRTL = getFields(orderedRTL); + String alert = fieldsLTR.equals(fieldsRTL) ? "" : " ⚠️ "; + transformedExample = rtlVersion + ltrVersion + alert; + } + } + + if (TO_ESCAPE.containsSome(example)) { + StringBuilder processed = new StringBuilder(); + example.codePoints() + .forEach( + x -> { + processed + .append(""); + }); + + transformedExample += "
") + .append( + TransliteratorUtilities.toHTML.transform( + CodePointEscaper.getEscaped(x, TO_ESCAPE))) + .append("
" + processed + "
"; } return transformedExample; } + /** + * Return a list of the fields, where each span is a sequence of: + * + * + * + * @param orderedLTR + * @return + */ + static final UnicodeSet NUMBERS = new UnicodeSet("\\p{N}").freeze(); + + static final UnicodeSet LETTERS_MARKS = new UnicodeSet("[\\p{L}\\p{M}]").freeze(); + static final UnicodeSet OTHERS = + new UnicodeSet(NUMBERS).addAll(LETTERS_MARKS).complement().freeze(); + static final Set ALL = ImmutableSet.of(NUMBERS, LETTERS_MARKS, OTHERS); + + private Set getFields(String ordered) { + Set result = + new LinkedHashSet<>(); // doesn't have to be a LHS, but helps with debugging + int start = 0; + while (start < ordered.length()) { + for (UnicodeSet us : ALL) { + int end = us.span(ordered, start, SpanCondition.CONTAINED); + if (end != start) { + result.add(ordered.substring(start, end)); + start = end; + break; + } + } + } + return result; + } + static final Pattern RELATIVE_DATE = PatternCache.get("®([a-z]+(?:-[a-z]+)?)+(-[a-z]+)?([+-]?\\d+)([a-zA-Z]+)?"); @@ -677,7 +788,7 @@ private String getRelativeExampleFromSkeleton(String skeleton) { RelativePattern rp = new RelativePattern(file, skeleton); String value = rp.value; if (value == null) { - value = "ⓜⓘⓢⓢⓘⓝⓖ"; + value = MISSING_PART; } else { DecimalFormat format = icuServiceBuilder.getNumberFormat(0); value = value.replace("{0}", format.format(Math.abs(rp.offset)).replace("'", "''")); @@ -988,10 +1099,9 @@ public static void main(String[] args) throws IOException { String organization = MyOptions.organization.option.getValue(); String filter = MyOptions.filter.option.getValue(); - Factory englishFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, filter); - CLDRFile englishFile = englishFactory.make("en", true); + CLDRFile englishFile = CONFIG.getEnglish(); - Factory factory = Factory.make(CLDRPaths.MAIN_DIRECTORY, LOCALES); + Factory factory = Factory.make(CLDRPaths.MAIN_DIRECTORY, filter); System.out.println("Total locales: " + 
factory.getAvailableLanguages().size()); DateTimeFormats english = new DateTimeFormats().set(englishFile, "gregorian"); @@ -1004,7 +1114,7 @@ public static void main(String[] args) throws IOException { Map sorted = new TreeMap<>(); SupplementalDataInfo sdi = SupplementalDataInfo.getInstance(); Set defaultContent = sdi.getDefaultContentLocales(); - for (String localeID : factory.getAvailableLanguages()) { + for (String localeID : factory.getAvailable()) { Level level = StandardCodes.make().getLocaleCoverageLevel(organization, localeID); if (Level.MODERN.compareTo(level) > 0) { continue; @@ -1047,7 +1157,7 @@ public static void main(String[] args) throws IOException { + name + "" + "

Index

\n" - + "

The following chart shows typical usage of date and time formatting with the Gregorian calendar. " + + "

The following chart shows typical usage of date and time formatting with the Gregorian calendar and default number system. " + "There is important information on Date/Time Review, " + "so please read that page before starting!

\n"); formats.addTable(english, out); From 2d8ad0b31276f9337e61c02550fe8794a313ce1c Mon Sep 17 00:00:00 2001 From: macchiati Date: Wed, 31 Jul 2024 06:17:09 -0700 Subject: [PATCH 2/3] CLDR-17844 Change to auto vs RTL comparison --- .../unicode/cldr/util/DateTimeFormats.java | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/DateTimeFormats.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/DateTimeFormats.java index a7f299a967e..60d38bbdac8 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/DateTimeFormats.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/DateTimeFormats.java @@ -90,13 +90,14 @@ enum MyOptions { private static final UnicodeSet BIDI_MARKS = new UnicodeSet("[:Bidi_Control:]").freeze(); private static final String ltrBackground = "background-color:#EEE;"; - private static final String tableBackground = " background-color:#DDF;"; + private static final String tableBackground = "background-color:#DDF; border: 1px solid blue;"; private static final String rtlStart = "
"; - private static final String ltrStart = "
"; + private static final String autoLtrStart = "
"; + private static final String autoStart = "
"; private static final String divEnd = "
"; private static final String tableStyle = - "style='border-collapse: collapse;" + tableBackground + " margin: auto'"; + "style='border-collapse: collapse;" + tableBackground + " margin: auto'"; // private static final String ltrSpan = ""; private static final String tableSpan = ""; @@ -522,8 +523,8 @@ public void addTable(DateTimeFormats comparison, Appendable output) { ? "" : "However, two examples are provided if the locale is right-to-left, like Arabic or Hebrew, " + "and the paragraph direction can cause a different display. " - + "The first has a RTL paragraph direction, " - + "while the second has a LTR paragraph direction " + + "The first has a RTL paragraph direction, " + + "while the second has a auto paragraph direction (LTR unless the first 'strong' character is RTL) " + ltrSpan + "and a different background" + spanEnd @@ -663,18 +664,21 @@ private String getExample(String skeleton) { } String transformedExample = TransliteratorUtilities.toHTML.transform(example); if ((isRTL || BIDI_MARKS.containsSome(example)) && !example.contains(MISSING_PART)) { - Bidi bidiLTR = new Bidi(example, Bidi.DIRECTION_LEFT_TO_RIGHT); + Bidi bidiLTR = new Bidi(example, Bidi.DIRECTION_DEFAULT_LEFT_TO_RIGHT); String orderedLTR = bidiLTR.writeReordered(0); Bidi bidiRTL = new Bidi(example, Bidi.DIRECTION_RIGHT_TO_LEFT); String orderedRTL = bidiRTL.writeReordered(0); if (!orderedLTR.equals(orderedRTL)) { // since this is RTL, we put it first - String rtlVersion = rtlStart + transformedExample + divEnd; - String ltrVersion = ltrStart + transformedExample + divEnd; // colored + String rtlVersion = rtlStart + transformedExample + divEnd; // not colored + String autoVersion = autoLtrStart + transformedExample + divEnd; // colored Set fieldsLTR = getFields(orderedLTR); Set fieldsRTL = getFields(orderedRTL); String alert = fieldsLTR.equals(fieldsRTL) ? 
"" : " ⚠️ "; - transformedExample = rtlVersion + ltrVersion + alert; + transformedExample = rtlVersion + autoVersion + alert; + } else { + String autoVersion = autoStart + transformedExample + divEnd; // not colored + transformedExample = autoVersion; } } From 595dde363d36cecd07b135d8ec8cef83c1774eb4 Mon Sep 17 00:00:00 2001 From: macchiati Date: Thu, 1 Aug 2024 06:43:41 -0700 Subject: [PATCH 3/3] CLDR-17844 cleanup --- .../java/org/unicode/cldr/util/BidiUtils.java | 162 ++++++++++++++++++ .../unicode/cldr/util/CodePointEscaper.java | 55 +++++- .../unicode/cldr/util/DateTimeFormats.java | 97 ++++------- 3 files changed, 249 insertions(+), 65 deletions(-) create mode 100644 tools/cldr-code/src/main/java/org/unicode/cldr/util/BidiUtils.java diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/BidiUtils.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/BidiUtils.java new file mode 100644 index 00000000000..dc7e9f2761b --- /dev/null +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/BidiUtils.java @@ -0,0 +1,162 @@ +package org.unicode.cldr.util; + +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Sets; +import com.google.common.collect.Sets.SetView; +import com.ibm.icu.text.Bidi; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSet.SpanCondition; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Set; +import java.util.TreeSet; +import java.util.stream.Collectors; + +/** + * A set of utilities for handling BIDI, especially in charts and examples but not restricted to + * that. 
+ */ +public class BidiUtils { + public static final String ALERT = "⚠️"; + static final String LRM = CodePointEscaper.LRM.getString(); + + // These are intended to be classes of characters that "stick together in order" + // The initial focus is dates, so this will probably need to be expanded for numbers; might need + // more syntax + + private enum SpanClass { + NUMBERS("\\p{N}"), + LETTERS_MARKS("[\\p{L}\\p{M}]"), + DATE_PUNCT("[+]"), + SPACES("\\p{Z}"), + OTHERS("\\p{any}") // must be last, to pick up remainder. + ; + final UnicodeSet uset; + + private SpanClass(String unicodeSetSource) { + uset = new UnicodeSet(unicodeSetSource); + } + + static { + // clean up by removing previous values + UnicodeSet soFar = new UnicodeSet(); + for (SpanClass sc : SpanClass.values()) { + sc.uset.removeAll(soFar).freeze(); + soFar.addAll(sc.uset); + } + } + } + /** + * Checks the ordering of the example, under the specified bidiDirectionOptions; + * + * @param example Source text, not HTMLified + * @param outputReorderedResults One string for each specified bidiDirectionOption + * @param bidiDirectionOptions an array of BIDI directions from com.ibm.icu.text.Bidi. if there + * are no items, the default is DIRECTION_DEFAULT_LEFT_TO_RIGHT (dir="auto"), + * DIRECTION_RIGHT_TO_LEFT (dir="rtl"). + * @return true unless two or more of the resulting strings are different. + */ + public static boolean isOrderingUnchanged( + String example, List outputReorderedResults, int... 
bidiDirectionOptions) { + boolean hasList = outputReorderedResults != null; + if (!hasList) { + outputReorderedResults = new ArrayList<>(); + } else { + outputReorderedResults.clear(); + } + boolean result = true; + for (int count = 0; count < bidiDirectionOptions.length; ++count) { + String reordered = new Bidi(example, bidiDirectionOptions[count]).writeReordered(0); + outputReorderedResults.add(reordered); + if (result && count != 0 && !reordered.equals(outputReorderedResults.get(0))) { + result = false; + if (!hasList) { + break; // if the output results are not needed, then stop. + } + } + } + return result; + } + + /** + * Return a list of the , where each span is a sequence of: + * + * @param orderedLTR + * @return + */ + /** + * Gets the 'fields' in a formatted string, used to test whether bidi reordering causes the + * original fields to merge when reordered. Each field is the longest contiguous span of + * characters with the same properties: * + * + *
    + *
  • numbers (\p{N}) + *
  • letters & marks ([\p{L}\p{M}]) + *
  • Other + *
+ * + * @param ordered + * @return a set of fields, in the same order as found in the text but duplicates removed (like + * LinkedHashSet). + */ + public static Set getFields(String reordred, Set result) { + int start = 0; + while (start < reordred.length()) { + for (SpanClass sc : SpanClass.values()) { + int end = sc.uset.span(reordred, start, SpanCondition.CONTAINED); + if (end != start) { + result.add(reordred.substring(start, end)); + start = end; + break; + } + } + } + return ImmutableSet.copyOf(result); + } + + /** + * Show when the fields in strings are different + * + * @param bidiReordereds + * @return + */ + public static String getAlert(List bidiReordereds) { + Set> results = new LinkedHashSet<>(); + for (String bidiReordered : bidiReordereds) { + Set fieldsLTR = BidiUtils.getFields(bidiReordered, new TreeSet<>()); + results.add(fieldsLTR); + } + if (results.size() < 2) { + return ""; + } + // there can still be differences within a field of OTHERS, that we ignore. + // EG ⚠️ 20,28,2B; 2B,28,20 " (+" vs " (+" + + // show just the difference in the first 2, for now. 
+ Iterator> it = results.iterator(); + Set first = it.next(); + Set second = it.next(); + SetView uniqueFirst = Sets.difference(first, second); + SetView uniqueSecond = Sets.difference(second, first); + return ALERT + " " + escape(uniqueFirst) + "; " + escape(uniqueSecond); + } + + public static String escape(Set uniqueFirst) { + return uniqueFirst.stream() + .map(x -> CodePointEscaper.toEscaped(x)) + .collect(Collectors.joining(LRM + ", " + LRM, LRM, LRM)); + } + + public static String alphagram(String string) { + return string.codePoints() + .sorted() + .collect( + StringBuilder::new, // Supplier supplier + StringBuilder::appendCodePoint, // ObjIntConsumer accumulator + StringBuilder::append // BiConsumer combiner + ) + .toString(); + } +} diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/CodePointEscaper.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/CodePointEscaper.java index 9b6c304024a..04d030b7a19 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/CodePointEscaper.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/CodePointEscaper.java @@ -1,6 +1,7 @@ package org.unicode.cldr.util; import com.ibm.icu.impl.UnicodeMap; +import com.ibm.icu.impl.Utility; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeSet; @@ -114,9 +115,7 @@ public enum CodePointEscaper { private final String description; private CodePointEscaper(int codePoint, String shortName) { - this.codePoint = codePoint; - this.shortName = shortName; - this.description = ""; + this(codePoint, shortName, ""); } private CodePointEscaper(int codePoint, String shortName, String description) { @@ -291,4 +290,54 @@ public static String rawCodePointToEscaped(int codePoint) { ? 
Integer.toString(codePoint, 16).toUpperCase(Locale.ROOT) : result.toString(); } + + public static final String getHtmlRows( + UnicodeSet escapesToShow, String tableOptions, String cellOptions) { + if (!escapesToShow.strings().isEmpty()) { + throw new IllegalArgumentException("No strings allowed in the unicode set."); + } + StringBuilder result = new StringBuilder(""); + UnicodeSet remaining = new UnicodeSet(escapesToShow); + String tdPlus = ""; + for (CodePointEscaper cpe : CodePointEscaper.values()) { + int cp = cpe.getCodePoint(); + remaining.remove(cp); + if (escapesToShow.contains(cpe.getCodePoint())) { + final String id = cpe.name(); + final String shortName = cpe.getShortName(); + final String description = cpe.getDescription(); + addREsult(result, tdPlus, id, shortName, description); + } + } + for (String cps : remaining) { + int cp = cps.codePointAt(0); + final String extendedName = UCharacter.getExtendedName(cp); + addREsult( + result, + tdPlus, + Utility.hex(cp, 2), + "", + extendedName == null ? 
"" : extendedName.toLowerCase()); + } + return result.append("").toString(); + } + + public static void addREsult( + StringBuilder result, + String tdPlus, + final String id, + final String shortName, + final String description) { + result.append("") + .append(tdPlus) + .append(ESCAPE_START) + .append(id) + .append(ESCAPE_END + "") + .append(tdPlus) + .append(shortName) + .append("") + .append(tdPlus) + .append(description) + .append(""); + } } diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/DateTimeFormats.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/DateTimeFormats.java index 60d38bbdac8..87b368f9bfd 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/DateTimeFormats.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/DateTimeFormats.java @@ -1,7 +1,6 @@ package org.unicode.cldr.util; import com.google.common.collect.ImmutableMap; -import com.google.common.collect.ImmutableSet; import com.ibm.icu.impl.Row.R3; import com.ibm.icu.text.Bidi; import com.ibm.icu.text.DateFormat; @@ -15,7 +14,6 @@ import com.ibm.icu.text.MessageFormat; import com.ibm.icu.text.SimpleDateFormat; import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSet.SpanCondition; import com.ibm.icu.util.Calendar; import com.ibm.icu.util.DateInterval; import com.ibm.icu.util.ICUUncheckedIOException; @@ -25,6 +23,7 @@ import java.io.File; import java.io.IOException; import java.io.PrintWriter; +import java.util.ArrayList; import java.util.Arrays; import java.util.Date; import java.util.EnumSet; @@ -51,9 +50,6 @@ public class DateTimeFormats { private static final UnicodeSet TO_ESCAPE = new UnicodeSet(CodePointEscaper.FORCE_ESCAPE) .remove(CodePointEscaper.SP.getCodePoint()) - .remove(CodePointEscaper.TSP.getCodePoint()) - .remove(CodePointEscaper.NBSP.getCodePoint()) - .remove(CodePointEscaper.NBTSP.getCodePoint()) .freeze(); private static final String MISSING_PART = "ⓜⓘⓢⓢⓘⓝⓖ"; private static final CLDRConfig CONFIG = 
CLDRConfig.getInstance(); @@ -513,12 +509,13 @@ public boolean isPresent(String skeleton) { * @param output */ public void addTable(DateTimeFormats comparison, Appendable output) { + UnicodeSet allEscapedCharactersFound = new UnicodeSet(); try { output.append( "

" + hackDoubleLinked("Patterns") + "

" - + "

Normally, there is one line containing an example in each Native Example cell. " + + "

Normally, there is a single line containing an example in each Native Example cell. " + (!isRTL ? "" : "However, two examples are provided if the locale is right-to-left, like Arabic or Hebrew, " @@ -528,14 +525,13 @@ public void addTable(DateTimeFormats comparison, Appendable output) { + ltrSpan + "and a different background" + spanEnd - + ". If the display of either example causes strings of letters or numbers to collide, " - + "then a ⚠️ is shown. ") + + ". If the display of either example appears to cause strings of letters or numbers to collide, " + + "then a ⚠️ is shown followed by differences (this is still experimental). ") + "When an example has hidden characters, then " + tableSpan + "an extra line" + spanEnd - + " shows those characters " - + "such as ❰RLM❱ for the invisible Right-to-Left Mark. " + + " shows those characters with short IDs ❰…❱: see the Key below the table. " + "So that the ordering of the characters in memory is clear, they are presented left-to-right one at a time. " + "so that the placement is clear. " + "When a pattern (or a component of a pattern) is missing, it is displayed as " @@ -572,8 +568,8 @@ public void addTable(DateTimeFormats comparison, Appendable output) { RowStyle.normal, name, skeleton, - comparison.getExample(skeleton), - getExample(skeleton), + comparison.getExample(skeleton, allEscapedCharactersFound), + getExample(skeleton, allEscapedCharactersFound), diff.isPresent(skeleton)); } } @@ -611,12 +607,21 @@ public void addTable(DateTimeFormats comparison, Appendable output) { RowStyle.normal, skeleton, skeleton, - comparison.getExample(skeleton), - getExample(skeleton), + comparison.getExample(skeleton, allEscapedCharactersFound), + getExample(skeleton, allEscapedCharactersFound), true); } } output.append(""); + if (!allEscapedCharactersFound.isEmpty()) { + output.append("\n

Key to Escaped Characters

\n"); + String keyToEscaped = + CodePointEscaper.getHtmlRows( + allEscapedCharactersFound, + " style='border:1px solid blue; border-collapse: collapse'", + " style='border:1px solid blue'"); + output.append(keyToEscaped); + } } catch (IOException e) { throw new ICUUncheckedIOException(e); } @@ -626,9 +631,10 @@ public void addTable(DateTimeFormats comparison, Appendable output) { * Get an example from the "enhanced" skeleton. * * @param skeleton + * @param escapedCharactersFound Any characters that were escaped are added to this. * @return */ - private String getExample(String skeleton) { + private String getExample(String skeleton, UnicodeSet escapedCharactersFound) { String example; if (skeleton.contains("®")) { example = getRelativeExampleFromSkeleton(skeleton); @@ -663,18 +669,17 @@ private String getExample(String skeleton) { } } String transformedExample = TransliteratorUtilities.toHTML.transform(example); + ArrayList listOfReorderings = new ArrayList<>(); if ((isRTL || BIDI_MARKS.containsSome(example)) && !example.contains(MISSING_PART)) { - Bidi bidiLTR = new Bidi(example, Bidi.DIRECTION_DEFAULT_LEFT_TO_RIGHT); - String orderedLTR = bidiLTR.writeReordered(0); - Bidi bidiRTL = new Bidi(example, Bidi.DIRECTION_RIGHT_TO_LEFT); - String orderedRTL = bidiRTL.writeReordered(0); - if (!orderedLTR.equals(orderedRTL)) { - // since this is RTL, we put it first + if (!BidiUtils.isOrderingUnchanged( + example, + listOfReorderings, + Bidi.DIRECTION_DEFAULT_LEFT_TO_RIGHT, + Bidi.DIRECTION_RIGHT_TO_LEFT)) { + // since this locale is RTL, we put it first String rtlVersion = rtlStart + transformedExample + divEnd; // not colored String autoVersion = autoLtrStart + transformedExample + divEnd; // colored - Set fieldsLTR = getFields(orderedLTR); - Set fieldsRTL = getFields(orderedRTL); - String alert = fieldsLTR.equals(fieldsRTL) ? 
"" : " ⚠️ "; + String alert = BidiUtils.getAlert(listOfReorderings); transformedExample = rtlVersion + autoVersion + alert; } else { String autoVersion = autoStart + transformedExample + divEnd; // not colored @@ -696,46 +701,11 @@ private String getExample(String skeleton) { }); transformedExample += "" + processed + "
"; + escapedCharactersFound.addAll(new UnicodeSet().addAll(example).retainAll(TO_ESCAPE)); } return transformedExample; } - /** - * Return a list of the fields, where each span is a sequence of: - * - *
    - *
  • numbers (\p{N}) - *
  • letters & marks ([\p{L}\p{M} - *
  • Other - *
- * - * @param orderedLTR - * @return - */ - static final UnicodeSet NUMBERS = new UnicodeSet("\\p{N}").freeze(); - - static final UnicodeSet LETTERS_MARKS = new UnicodeSet("[\\p{L}\\p{M}]").freeze(); - static final UnicodeSet OTHERS = - new UnicodeSet(NUMBERS).addAll(LETTERS_MARKS).complement().freeze(); - static final Set ALL = ImmutableSet.of(NUMBERS, LETTERS_MARKS, OTHERS); - - private Set getFields(String ordered) { - Set result = - new LinkedHashSet<>(); // doesn't have to be a LHS, but helps with debugging - int start = 0; - while (start < ordered.length()) { - for (UnicodeSet us : ALL) { - int end = us.span(ordered, start, SpanCondition.CONTAINED); - if (end != start) { - result.add(ordered.substring(start, end)); - start = end; - break; - } - } - } - return result; - } - static final Pattern RELATIVE_DATE = PatternCache.get("®([a-z]+(?:-[a-z]+)?)+(-[a-z]+)?([+-]?\\d+)([a-zA-Z]+)?"); @@ -1102,11 +1072,14 @@ public static void main(String[] args) throws IOException { String organization = MyOptions.organization.option.getValue(); String filter = MyOptions.filter.option.getValue(); + boolean hasFilter = MyOptions.filter.option.doesOccur(); CLDRFile englishFile = CONFIG.getEnglish(); Factory factory = Factory.make(CLDRPaths.MAIN_DIRECTORY, filter); - System.out.println("Total locales: " + factory.getAvailableLanguages().size()); + final Set availableLocales = + hasFilter ? 
factory.getAvailable() : factory.getAvailableLanguages(); + System.out.println("Total locales: " + availableLocales.size()); DateTimeFormats english = new DateTimeFormats().set(englishFile, "gregorian"); new File(DIR).mkdirs(); @@ -1118,7 +1091,7 @@ public static void main(String[] args) throws IOException { Map sorted = new TreeMap<>(); SupplementalDataInfo sdi = SupplementalDataInfo.getInstance(); Set defaultContent = sdi.getDefaultContentLocales(); - for (String localeID : factory.getAvailable()) { + for (String localeID : availableLocales) { Level level = StandardCodes.make().getLocaleCoverageLevel(organization, localeID); if (Level.MODERN.compareTo(level) > 0) { continue;