unicode-org · macchiati · Aug 2, 2024 · Jul 31, 2024 · Jul 31, 2024 · Aug 1, 2024
diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/BidiUtils.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/BidiUtils.java
@@ -0,0 +1,162 @@
+package org.unicode.cldr.util;
+
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.Sets;
+import com.google.common.collect.Sets.SetView;
+import com.ibm.icu.text.Bidi;
+import com.ibm.icu.text.UnicodeSet;
+import com.ibm.icu.text.UnicodeSet.SpanCondition;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.stream.Collectors;
+
+/**
+ * A set of utilities for handling BIDI, especially in charts and examples but not restricted to
+ * that.
+ */
+public class BidiUtils {
+    public static final String ALERT = "⚠️";
+    static final String LRM = CodePointEscaper.LRM.getString();
+
+    // These are intended to be classes of characters that "stick together in order"
+    // The initial focus is dates, so this will probably need to be expanded for numbers; might need
+    // more syntax
+
+    private enum SpanClass {
+        NUMBERS("\\p{N}"),
+        LETTERS_MARKS("[\\p{L}\\p{M}]"),
+        DATE_PUNCT("[+]"),
+        SPACES("\\p{Z}"),
+        OTHERS("\\p{any}") // must be last, to pick up remainder.
+    ;
+        final UnicodeSet uset;
+
+        private SpanClass(String unicodeSetSource) {
+            uset = new UnicodeSet(unicodeSetSource);
+        }
+
+        static {
+            // make the classes disjoint: remove from each one whatever an earlier one contains
+            UnicodeSet soFar = new UnicodeSet();
+            for (SpanClass sc : SpanClass.values()) {
+                sc.uset.removeAll(soFar).freeze();
+                soFar.addAll(sc.uset);
+            }
+        }
+    }
+    /**
+     * Checks the ordering of the example, under the specified bidiDirectionOptions.
+     *
+     * @param example Source text, not HTMLified
+     * @param outputReorderedResults if not null, cleared and filled with one string per direction
+     * @param bidiDirectionOptions BIDI directions from com.ibm.icu.text.Bidi. If null or empty,
+     *     the default is DIRECTION_DEFAULT_LEFT_TO_RIGHT (dir="auto"), DIRECTION_RIGHT_TO_LEFT
+     *     (dir="rtl").
+     * @return true unless two or more of the resulting strings are different.
+     */
+    public static boolean isOrderingUnchanged(
+            String example, List<String> outputReorderedResults, int... bidiDirectionOptions) {
+        // Nothing specified: use the documented defaults, so that there is something to compare.
+        if (bidiDirectionOptions == null || bidiDirectionOptions.length == 0) {
+            bidiDirectionOptions =
+                    new int[] {Bidi.DIRECTION_DEFAULT_LEFT_TO_RIGHT, Bidi.DIRECTION_RIGHT_TO_LEFT};
+        }
+        boolean hasList = outputReorderedResults != null;
+        if (!hasList) {
+            outputReorderedResults = new ArrayList<>();
+        } else {
+            outputReorderedResults.clear();
+        }
+        boolean result = true;
+        for (int count = 0; count < bidiDirectionOptions.length; ++count) {
+            String reordered = new Bidi(example, bidiDirectionOptions[count]).writeReordered(0);
+            outputReorderedResults.add(reordered);
+            if (result && count != 0 && !reordered.equals(outputReorderedResults.get(0))) {
+                result = false;
+                if (!hasList) {
+                    break; // if the output results are not needed, then stop.
+                }
+            }
+        }
+        return result;
+    }
+
+    /**
+     * Gets the 'fields' in a formatted string, used to test whether bidi reordering causes the
+     * original fields to merge when reordered. Each field is the longest contiguous span of
+     * characters in the same class, where the classes are tried in this order:
+     *
+     * <ul>
+     *   <li>numbers (\p{N})
+     *   <li>letters &amp; marks ([\p{L}\p{M}])
+     *   <li>date punctuation ([+])
+     *   <li>spaces (\p{Z})
+     *   <li>others (everything else)
+     * </ul>
+     *
+     * @param reordred the text to split into fields
+     * @param result the set the fields are added to; its type determines the iteration order
+     * @return an immutable copy of result, taken after the fields have been added
+     */
+    public static Set<String> getFields(String reordred, Set<String> result) {
+        final SpanClass[] spanClasses = SpanClass.values(); // values() makes a copy; call once
+        int start = 0;
+        while (start < reordred.length()) {
+            for (SpanClass sc : spanClasses) {
+                int end = sc.uset.span(reordred, start, SpanCondition.CONTAINED);
+                if (end != start) {
+                    result.add(reordred.substring(start, end));
+                    start = end;
+                    break;
+                }
+            }
+        }
+        return ImmutableSet.copyOf(result);
+    }
+
+    /**
+     * Show when the fields in strings are different
+     *
+     * @param bidiReordereds the strings to compare, after bidi reordering
+     * @return "" if fewer than two distinct sets of fields are found; otherwise ALERT, then the
+     *     escaped fields found only in the first such set, "; ", and those only in the second
+     */
+    public static String getAlert(List<String> bidiReordereds) {
+        Set<Set<String>> results = new LinkedHashSet<>();
+        for (String bidiReordered : bidiReordereds) {
+            results.add(BidiUtils.getFields(bidiReordered, new TreeSet<>()));
+        }
+        if (results.size() < 2) {
+            return "";
+        }
+        // there can still be differences within a field of OTHERS, that we  ignore.
+        // EG ⚠️ 20,28,2B; 2B,28,20 " (+" vs " (+"
+
+        // show just the difference in the first 2, for now.
+        Iterator<Set<String>> it = results.iterator();
+        Set<String> first = it.next();
+        Set<String> second = it.next();
+        SetView<String> uniqueFirst = Sets.difference(first, second);
+        SetView<String> uniqueSecond = Sets.difference(second, first);
+        return ALERT + " " + escape(uniqueFirst) + "; " + escape(uniqueSecond);
+    }
+
+    /** Returns the strings, each escaped and enclosed in LRMs, joined by ", ". */
+    public static String escape(Set<String> uniqueFirst) {
+        return uniqueFirst.stream()
+                .map(x -> CodePointEscaper.toEscaped(x))
+                .collect(Collectors.joining(LRM + ", " + LRM, LRM, LRM));
+    }
+
+    /** Returns a string with the same code points as the input, sorted in ascending order. */
+    public static String alphagram(String string) {
+        return string.codePoints()
+                .sorted()
+                .collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append)
+                .toString();
+    }
+}
diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/CodePointEscaper.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/CodePointEscaper.java
@@ -1,7 +1,9 @@
 package org.unicode.cldr.util;
 
 import com.ibm.icu.impl.UnicodeMap;
+import com.ibm.icu.impl.Utility;
 import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.text.UTF16;
 import com.ibm.icu.text.UnicodeSet;
 import java.util.Locale;
 
@@ -19,10 +21,13 @@ public enum CodePointEscaper {
     LF(0xA, "line feed"),
     CR(0xD, "carriage return"),
     SP(0x20, "space", "ASCII space"),
-    NSP(0x2009, "narrow/thin space", "Also known as ‘thin space’"),
+    TSP(0x2009, "thin space", "Aka ‘narrow space’"),
     NBSP(0xA0, "no-break space", "Same as space, but doesn’t line wrap."),
 
-    NNBSP(0x202F, "narrow/thin no-break space", "Same as narrow space, but doesn’t line wrap."),
+    NBTSP(
+            0x202F,
+            "no-break thin space",
+            "Same as thin space, but doesn’t line wrap. Aka 'narrow no-break space'"),
 
     WNJ(
             0x200B,
@@ -110,9 +115,7 @@ public enum CodePointEscaper {
     private final String description;
 
     private CodePointEscaper(int codePoint, String shortName) {
-        this.codePoint = codePoint;
-        this.shortName = shortName;
-        this.description = "";
+        this(codePoint, shortName, ""); // no description given: delegate with an empty one
     }
 
     private CodePointEscaper(int codePoint, String shortName, String description) {
@@ -147,6 +150,11 @@ public int getCodePoint() {
         return codePoint;
     }
 
+    /** Returns a string that consists of just the code point of this enum constant. */
+    public String getString() {
+        return new String(Character.toChars(codePoint));
+    }
+
     /** Returns the escaped form from the code point for this enum */
     public String codePointToEscaped() {
         return ESCAPE_START + rawCodePointToEscaped(codePoint) + ESCAPE_END;
@@ -196,6 +204,15 @@ public static String toEscaped(String unescaped, UnicodeSet toEscape) {
                         });
         return result.toString();
     }
+
+    /**
+     * Returns the code point as a string, in escaped form if toEscape contains it and unchanged
+     * otherwise.
+     */
+    public static String getEscaped(int cp, UnicodeSet toEscape) {
+        return toEscape.contains(cp) ? codePointToEscaped(cp) : UTF16.valueOf(cp);
+    }
+
     /** Return unescaped string */
     public static String toUnescaped(String escaped) {
         if (escaped == null) {
@@ -273,4 +290,54 @@ public static String rawCodePointToEscaped(int codePoint) {
                 ? Integer.toString(codePoint, 16).toUpperCase(Locale.ROOT)
                 : result.toString();
     }
+
+    /**
+     * Returns an HTML table with rows for the enum constants in escapesToShow, then for the rest.
+     *
+     * @param escapesToShow the code points to show
+     * @param tableOptions inserted verbatim after {@code <table}; attributes need a leading space
+     * @param cellOptions inserted verbatim after {@code <td} in each cell
+     * @throws IllegalArgumentException if escapesToShow contains strings
+     */
+    public static final String getHtmlRows(
+            UnicodeSet escapesToShow, String tableOptions, String cellOptions) {
+        if (!escapesToShow.strings().isEmpty()) {
+            throw new IllegalArgumentException("No strings allowed in the unicode set.");
+        }
+        StringBuilder result = new StringBuilder("<table" + tableOptions + ">");
+        UnicodeSet remaining = new UnicodeSet(escapesToShow);
+        String tdPlus = "<td" + cellOptions + ">";
+        for (CodePointEscaper cpe : CodePointEscaper.values()) {
+            remaining.remove(cpe.getCodePoint());
+            if (escapesToShow.contains(cpe.getCodePoint())) {
+                addREsult(result, tdPlus, cpe.name(), cpe.getShortName(), cpe.getDescription());
+            }
+        }
+        for (String cps : remaining) {
+            int cp = cps.codePointAt(0);
+            String name = UCharacter.getExtendedName(cp);
+            String desc = name == null ? "" : name.toLowerCase(Locale.ROOT); // locale-independent
+            addREsult(result, tdPlus, Utility.hex(cp, 2), "", desc);
+        }
+        return result.append("</table>").toString();
+    }
+
+    /**
+     * Appends one table row with three cells: the id between ESCAPE_START and ESCAPE_END, the
+     * short name, and the description. The values are appended as is, without HTML escaping.
+     *
+     * @param result the builder that the row is appended to
+     * @param tdPlus the opening tag to use for each cell
+     * @param id the text for the first cell
+     * @param shortName the text for the second cell
+     * @param description the text for the third cell
+     */
+    public static void addREsult(
+            StringBuilder result, String tdPlus, String id, String shortName, String description) {
+        result.append("<tr>");
+        result.append(tdPlus).append(ESCAPE_START).append(id).append(ESCAPE_END).append("</td>");
+        result.append(tdPlus).append(shortName).append("</td>");
+        result.append(tdPlus).append(description).append("</td>");
+        result.append("</tr>"); // closes the row (an opening <tr> was appended here before)
+    }
 }