diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/BidiUtils.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/BidiUtils.java new file mode 100644 index 00000000000..dc7e9f2761b --- /dev/null +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/BidiUtils.java @@ -0,0 +1,162 @@ +package org.unicode.cldr.util; + +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Sets; +import com.google.common.collect.Sets.SetView; +import com.ibm.icu.text.Bidi; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSet.SpanCondition; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Set; +import java.util.TreeSet; +import java.util.stream.Collectors; + +/** + * A set of utilities for handling BIDI, especially in charts and examples but not restricted to + * that. + */ +public class BidiUtils { + public static final String ALERT = "⚠️"; + static final String LRM = CodePointEscaper.LRM.getString(); + + // These are intended to be classes of characters that "stick together in order" + // The initial focus is dates, so this will probably need to be expanded for numbers; might need + // more syntax + + private enum SpanClass { + NUMBERS("\\p{N}"), + LETTERS_MARKS("[\\p{L}\\p{M}]"), + DATE_PUNCT("[+]"), + SPACES("\\p{Z}"), + OTHERS("\\p{any}") // must be last, to pick up remainder. 
+ ; + final UnicodeSet uset; + + private SpanClass(String unicodeSetSource) { + uset = new UnicodeSet(unicodeSetSource); + } + + static { + // clean up by removing previous values + UnicodeSet soFar = new UnicodeSet(); + for (SpanClass sc : SpanClass.values()) { + sc.uset.removeAll(soFar).freeze(); + soFar.addAll(sc.uset); + } + } + } + /** + * Checks the ordering of the example, under the specified bidiDirectionOptions. + * + * @param example Source text, not HTMLified + * @param outputReorderedResults One string for each specified bidiDirectionOption; may be null + * @param bidiDirectionOptions an array of BIDI directions from com.ibm.icu.text.Bidi. NOTE: if + * there are no items, nothing is reordered and true is returned; the intended default of + * DIRECTION_DEFAULT_LEFT_TO_RIGHT (dir="auto"), DIRECTION_RIGHT_TO_LEFT (dir="rtl") is not applied. + * @return true unless two or more of the resulting strings are different. + */ + public static boolean isOrderingUnchanged( + String example, List outputReorderedResults, int... bidiDirectionOptions) { + boolean hasList = outputReorderedResults != null; + if (!hasList) { + outputReorderedResults = new ArrayList<>(); + } else { + outputReorderedResults.clear(); + } + boolean result = true; + for (int count = 0; count < bidiDirectionOptions.length; ++count) { + String reordered = new Bidi(example, bidiDirectionOptions[count]).writeReordered(0); + outputReorderedResults.add(reordered); + if (result && count != 0 && !reordered.equals(outputReorderedResults.get(0))) { + result = false; + if (!hasList) { + break; // if the output results are not needed, then stop. + } + } + } + return result; + } + + /** + * Return a list of the spans, where each span is a sequence of: + * + * @param orderedLTR + * @return + */ + /** + * Gets the 'fields' in a formatted string, used to test whether bidi reordering causes the + * original fields to merge when reordered. 
Each field is the longest contiguous span of + * characters with the same properties: * + * + * + * + * @param reordred the text to split into fields + * @return an immutable copy of result with the fields added; duplicates are removed, and the order + * is the order found in the text only if result keeps insertion order (like LinkedHashSet). + */ + public static Set getFields(String reordred, Set result) { + int start = 0; + while (start < reordred.length()) { + for (SpanClass sc : SpanClass.values()) { + int end = sc.uset.span(reordred, start, SpanCondition.CONTAINED); + if (end != start) { + result.add(reordred.substring(start, end)); + start = end; + break; + } + } + } + return ImmutableSet.copyOf(result); + } + + /** + * Show when the fields in strings are different + * + * @param bidiReordereds the reordered strings whose fields are compared + * @return "" if all strings have the same fields; else ALERT plus the fields unique to each of the first two distinct field sets + */ + public static String getAlert(List bidiReordereds) { + Set> results = new LinkedHashSet<>(); + for (String bidiReordered : bidiReordereds) { + Set fieldsLTR = BidiUtils.getFields(bidiReordered, new TreeSet<>()); + results.add(fieldsLTR); + } + if (results.size() < 2) { + return ""; + } + // there can still be differences within a field of OTHERS, that we ignore. + // EG ⚠️ 20,28,2B; 2B,28,20 " (+" vs " (+" + + // show just the difference in the first 2, for now. 
+ Iterator> it = results.iterator(); + Set first = it.next(); + Set second = it.next(); + SetView uniqueFirst = Sets.difference(first, second); + SetView uniqueSecond = Sets.difference(second, first); + return ALERT + " " + escape(uniqueFirst) + "; " + escape(uniqueSecond); + } + + public static String escape(Set uniqueFirst) { + return uniqueFirst.stream() + .map(x -> CodePointEscaper.toEscaped(x)) + .collect(Collectors.joining(LRM + ", " + LRM, LRM, LRM)); + } + + public static String alphagram(String string) { + return string.codePoints() + .sorted() + .collect( + StringBuilder::new, // Supplier supplier + StringBuilder::appendCodePoint, // ObjIntConsumer accumulator + StringBuilder::append // BiConsumer combiner + ) + .toString(); + } +} diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/CodePointEscaper.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/CodePointEscaper.java index 711d16895fa..04d030b7a19 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/CodePointEscaper.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/CodePointEscaper.java @@ -1,7 +1,9 @@ package org.unicode.cldr.util; import com.ibm.icu.impl.UnicodeMap; +import com.ibm.icu.impl.Utility; import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeSet; import java.util.Locale; @@ -19,10 +21,13 @@ public enum CodePointEscaper { LF(0xA, "line feed"), CR(0xD, "carriage return"), SP(0x20, "space", "ASCII space"), - NSP(0x2009, "narrow/thin space", "Also known as ‘thin space’"), + TSP(0x2009, "thin space", "Aka ‘narrow space’"), NBSP(0xA0, "no-break space", "Same as space, but doesn’t line wrap."), - NNBSP(0x202F, "narrow/thin no-break space", "Same as narrow space, but doesn’t line wrap."), + NBTSP( + 0x202F, + "no-break thin space", + "Same as thin space, but doesn’t line wrap. 
Aka 'narrow no-break space'"), WNJ( 0x200B, @@ -110,9 +115,7 @@ public enum CodePointEscaper { private final String description; private CodePointEscaper(int codePoint, String shortName) { - this.codePoint = codePoint; - this.shortName = shortName; - this.description = ""; + this(codePoint, shortName, ""); } private CodePointEscaper(int codePoint, String shortName, String description) { @@ -147,6 +150,11 @@ public int getCodePoint() { return codePoint; } + /** Return the string form of the code point for this character. */ + public String getString() { + return UTF16.valueOf(codePoint); + } + /** Returns the escaped form from the code point for this enum */ public String codePointToEscaped() { return ESCAPE_START + rawCodePointToEscaped(codePoint) + ESCAPE_END; @@ -196,6 +204,15 @@ public static String toEscaped(String unescaped, UnicodeSet toEscape) { }); return result.toString(); } + + public static String getEscaped(int cp, UnicodeSet toEscape) { + if (!toEscape.contains(cp)) { + return UTF16.valueOf(cp); + } else { + return codePointToEscaped(cp); + } + } + /** Return unescaped string */ public static String toUnescaped(String escaped) { if (escaped == null) { @@ -273,4 +290,54 @@ public static String rawCodePointToEscaped(int codePoint) { ? 
Integer.toString(codePoint, 16).toUpperCase(Locale.ROOT) : result.toString(); } + + public static final String getHtmlRows( + UnicodeSet escapesToShow, String tableOptions, String cellOptions) { + if (!escapesToShow.strings().isEmpty()) { + throw new IllegalArgumentException("No strings allowed in the unicode set."); + } + StringBuilder result = new StringBuilder(""); + UnicodeSet remaining = new UnicodeSet(escapesToShow); + String tdPlus = ""; + for (CodePointEscaper cpe : CodePointEscaper.values()) { + int cp = cpe.getCodePoint(); + remaining.remove(cp); + if (escapesToShow.contains(cpe.getCodePoint())) { + final String id = cpe.name(); + final String shortName = cpe.getShortName(); + final String description = cpe.getDescription(); + addREsult(result, tdPlus, id, shortName, description); + } + } + for (String cps : remaining) { + int cp = cps.codePointAt(0); + final String extendedName = UCharacter.getExtendedName(cp); + addREsult( + result, + tdPlus, + Utility.hex(cp, 2), + "", + extendedName == null ? 
"" : extendedName.toLowerCase()); + } + return result.append("").toString(); + } + + public static void addREsult( + StringBuilder result, + String tdPlus, + final String id, + final String shortName, + final String description) { + result.append("") + .append(tdPlus) + .append(ESCAPE_START) + .append(id) + .append(ESCAPE_END + "") + .append(tdPlus) + .append(shortName) + .append("") + .append(tdPlus) + .append(description) + .append(""); + } } diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/DateTimeFormats.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/DateTimeFormats.java index e14b0044046..87b368f9bfd 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/DateTimeFormats.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/DateTimeFormats.java @@ -2,6 +2,7 @@ import com.google.common.collect.ImmutableMap; import com.ibm.icu.impl.Row.R3; +import com.ibm.icu.text.Bidi; import com.ibm.icu.text.DateFormat; import com.ibm.icu.text.DateIntervalFormat; import com.ibm.icu.text.DateIntervalInfo; @@ -22,6 +23,7 @@ import java.io.File; import java.io.IOException; import java.io.PrintWriter; +import java.util.ArrayList; import java.util.Arrays; import java.util.Date; import java.util.EnumSet; @@ -45,6 +47,12 @@ import org.unicode.cldr.util.SupplementalDataInfo.PluralInfo.Count; public class DateTimeFormats { + private static final UnicodeSet TO_ESCAPE = + new UnicodeSet(CodePointEscaper.FORCE_ESCAPE) + .remove(CodePointEscaper.SP.getCodePoint()) + .freeze(); + private static final String MISSING_PART = "ⓜⓘⓢⓢⓘⓝⓖ"; + private static final CLDRConfig CONFIG = CLDRConfig.getInstance(); private static final Date SAMPLE_DATE_DEFAULT_END = new Date(2099 - 1900, 0, 13, 14, 45, 59); private static final String DIR = CLDRPaths.CHART_DIRECTORY + "/verify/dates/"; private static SupplementalDataInfo sdi = SupplementalDataInfo.getInstance(); @@ -76,9 +84,20 @@ enum MyOptions { // constant sets should // probably be moved to a common file 
of such things. private static final UnicodeSet BIDI_MARKS = new UnicodeSet("[:Bidi_Control:]").freeze(); - private static final String exampleSep = "
"; + + private static final String ltrBackground = "background-color:#EEE;"; + private static final String tableBackground = "background-color:#DDF; border: 1px solid blue;"; + private static final String rtlStart = "
"; - private static final String rtlEnd = "
"; + private static final String autoLtrStart = "
"; + private static final String autoStart = "
"; + private static final String divEnd = "
"; + private static final String tableStyle = + "style='border-collapse: collapse;" + tableBackground + " margin: auto'"; // + + private static final String ltrSpan = ""; + private static final String tableSpan = ""; + private static final String spanEnd = ""; private static final String[] STOCK = {"short", "medium", "long", "full"}; private static final String[] CALENDAR_FIELD_TO_PATTERN_LETTER = { @@ -125,7 +144,7 @@ enum MyOptions { private ULocale locale; private ICUServiceBuilder icuServiceBuilder; private ICUServiceBuilder icuServiceBuilderEnglish = - new ICUServiceBuilder().setCldrFile(CLDRConfig.getInstance().getEnglish()); + new ICUServiceBuilder().setCldrFile(CONFIG.getEnglish()); private DateIntervalInfo dateIntervalInfo = new DateIntervalInfo(); private String calendarID; @@ -133,8 +152,7 @@ enum MyOptions { private boolean isRTL; private static String surveyUrl = - CLDRConfig.getInstance() - .getProperty("CLDR_SURVEY_URL", "http://st.unicode.org/cldr-apps/survey"); + CONFIG.getProperty("CLDR_SURVEY_URL", "http://st.unicode.org/cldr-apps/survey"); /** * Set a CLDRFile and calendar. Must be done before calling addTable. @@ -491,9 +509,35 @@ public boolean isPresent(String skeleton) { * @param output */ public void addTable(DateTimeFormats comparison, Appendable output) { + UnicodeSet allEscapedCharactersFound = new UnicodeSet(); try { output.append( - "

" + hackDoubleLinked("Patterns") + "

\n"); + "

" + + hackDoubleLinked("Patterns") + + "

" + + "

Normally, there is a single line containing an example in each Native Example cell. " + + (!isRTL + ? "" + : "However, two examples are provided if the locale is right-to-left, like Arabic or Hebrew, " + + "and the paragraph direction can cause a different display. " + + "The first has a RTL paragraph direction, " + + "while the second has a auto paragraph direction (LTR unless the first 'strong' character is RTL) " + + ltrSpan + + "and a different background" + + spanEnd + + ". If the display of either example appears to cause strings of letters or numbers to collide, " + + "then a ⚠️ is shown followed by differences (this is still experimental). ") + + "When an example has hidden characters, then " + + tableSpan + + "an extra line" + + spanEnd + + " shows those characters with short IDs ❰…❱: see the Key below the table. " + + "So that the ordering of the characters in memory is clear, they are presented left-to-right one at a time. " + + "so that the placement is clear. " + + "When a pattern (or a component of a pattern) is missing, it is displayed as " + + MISSING_PART + + ".

" + + "\n
"); Diff diff = new Diff(); boolean is24h = generator.getDefaultHourFormatChar() == 'H'; showRow( @@ -502,7 +546,7 @@ public void addTable(DateTimeFormats comparison, Appendable output) { FIELDS_TITLE, "Skeleton", "English Example", - "Native Example (neutral context,
then RTL if relevant)", + "Native Example", false); for (String[] nameAndSkeleton : NAME_AND_PATTERN) { String name = nameAndSkeleton[0]; @@ -524,8 +568,8 @@ public void addTable(DateTimeFormats comparison, Appendable output) { RowStyle.normal, name, skeleton, - comparison.getExample(skeleton), - getExample(skeleton), + comparison.getExample(skeleton, allEscapedCharactersFound), + getExample(skeleton, allEscapedCharactersFound), diff.isPresent(skeleton)); } } @@ -563,12 +607,21 @@ public void addTable(DateTimeFormats comparison, Appendable output) { RowStyle.normal, skeleton, skeleton, - comparison.getExample(skeleton), - getExample(skeleton), + comparison.getExample(skeleton, allEscapedCharactersFound), + getExample(skeleton, allEscapedCharactersFound), true); } } output.append("
"); + if (!allEscapedCharactersFound.isEmpty()) { + output.append("\n

Key to Escaped Characters

\n"); + String keyToEscaped = + CodePointEscaper.getHtmlRows( + allEscapedCharactersFound, + " style='border:1px solid blue; border-collapse: collapse'", + " style='border:1px solid blue'"); + output.append(keyToEscaped); + } } catch (IOException e) { throw new ICUUncheckedIOException(e); } @@ -578,9 +631,10 @@ public void addTable(DateTimeFormats comparison, Appendable output) { * Get an example from the "enhanced" skeleton. * * @param skeleton + * @param escapedCharactersFound Any characters that were escaped are added to this. * @return */ - private String getExample(String skeleton) { + private String getExample(String skeleton, UnicodeSet escapedCharactersFound) { String example; if (skeleton.contains("®")) { example = getRelativeExampleFromSkeleton(skeleton); @@ -615,8 +669,39 @@ private String getExample(String skeleton) { } } String transformedExample = TransliteratorUtilities.toHTML.transform(example); - if (isRTL || BIDI_MARKS.containsSome(transformedExample)) { - transformedExample += exampleSep + rtlStart + transformedExample + rtlEnd; + ArrayList listOfReorderings = new ArrayList<>(); + if ((isRTL || BIDI_MARKS.containsSome(example)) && !example.contains(MISSING_PART)) { + if (!BidiUtils.isOrderingUnchanged( + example, + listOfReorderings, + Bidi.DIRECTION_DEFAULT_LEFT_TO_RIGHT, + Bidi.DIRECTION_RIGHT_TO_LEFT)) { + // since this locale is RTL, we put it first + String rtlVersion = rtlStart + transformedExample + divEnd; // not colored + String autoVersion = autoLtrStart + transformedExample + divEnd; // colored + String alert = BidiUtils.getAlert(listOfReorderings); + transformedExample = rtlVersion + autoVersion + alert; + } else { + String autoVersion = autoStart + transformedExample + divEnd; // not colored + transformedExample = autoVersion; + } + } + + if (TO_ESCAPE.containsSome(example)) { + StringBuilder processed = new StringBuilder(); + example.codePoints() + .forEach( + x -> { + processed + .append("") + .append( + 
TransliteratorUtilities.toHTML.transform( + CodePointEscaper.getEscaped(x, TO_ESCAPE))) + .append(""); + }); + + transformedExample += "" + processed + "
"; + escapedCharactersFound.addAll(new UnicodeSet().addAll(example).retainAll(TO_ESCAPE)); } return transformedExample; } @@ -677,7 +762,7 @@ private String getRelativeExampleFromSkeleton(String skeleton) { RelativePattern rp = new RelativePattern(file, skeleton); String value = rp.value; if (value == null) { - value = "ⓜⓘⓢⓢⓘⓝⓖ"; + value = MISSING_PART; } else { DecimalFormat format = icuServiceBuilder.getNumberFormat(0); value = value.replace("{0}", format.format(Math.abs(rp.offset)).replace("'", "''")); @@ -987,12 +1072,14 @@ public static void main(String[] args) throws IOException { String organization = MyOptions.organization.option.getValue(); String filter = MyOptions.filter.option.getValue(); + boolean hasFilter = MyOptions.filter.option.doesOccur(); - Factory englishFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, filter); - CLDRFile englishFile = englishFactory.make("en", true); + CLDRFile englishFile = CONFIG.getEnglish(); - Factory factory = Factory.make(CLDRPaths.MAIN_DIRECTORY, LOCALES); - System.out.println("Total locales: " + factory.getAvailableLanguages().size()); + Factory factory = Factory.make(CLDRPaths.MAIN_DIRECTORY, filter); + final Set availableLocales = + hasFilter ? 
factory.getAvailable() : factory.getAvailableLanguages(); + System.out.println("Total locales: " + availableLocales.size()); DateTimeFormats english = new DateTimeFormats().set(englishFile, "gregorian"); new File(DIR).mkdirs(); @@ -1004,7 +1091,7 @@ public static void main(String[] args) throws IOException { Map sorted = new TreeMap<>(); SupplementalDataInfo sdi = SupplementalDataInfo.getInstance(); Set defaultContent = sdi.getDefaultContentLocales(); - for (String localeID : factory.getAvailableLanguages()) { + for (String localeID : availableLocales) { Level level = StandardCodes.make().getLocaleCoverageLevel(organization, localeID); if (Level.MODERN.compareTo(level) > 0) { continue; @@ -1047,7 +1134,7 @@ public static void main(String[] args) throws IOException { + name + "" + "

Index

\n" - + "

The following chart shows typical usage of date and time formatting with the Gregorian calendar. " + + "

The following chart shows typical usage of date and time formatting with the Gregorian calendar and default number system. " + "There is important information on Date/Time Review, " + "so please read that page before starting!

\n"); formats.addTable(english, out);