CLDR-17063 CLDRModify -fQ debugging, real/fake keyword paths

- This illustrates the cause of the draft=unconfirmed bug.
- The old, "fake" keyword path is derived from the tts path by removeAttribute.
- The new, "real" keyword path is obtained from the CLDRFile.
- In general, the two are not the same; the common difference is draft=unconfirmed.
unicode-org · btangmu · Sep 18, 2023 · Sep 18, 2023 · Sep 18, 2023 · 7f3d2e9effc340ac675d74c0193ea599d7c66fa0
commit 7f3d2e9effc340ac675d74c0193ea599d7c66fa0
diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/test/DisplayAndInputProcessor.java b/tools/cldr-code/src/main/java/org/unicode/cldr/test/DisplayAndInputProcessor.java
@@ -5,6 +5,7 @@
 
 import com.google.common.base.Joiner;
 import com.google.common.base.Splitter;
+import com.google.common.collect.TreeMultimap;
 import com.google.myanmartools.ZawgyiDetector;
 import com.ibm.icu.lang.UCharacter;
 import com.ibm.icu.text.Collator;
@@ -71,15 +72,6 @@ public class DisplayAndInputProcessor {
     public static final UnicodeSet RTL =
             new UnicodeSet("[[:Bidi_Class=Arabic_Letter:][:Bidi_Class=Right_To_Left:]]").freeze();
 
-    public static final UnicodeSet TO_QUOTE =
-            new UnicodeSet(
-                            "[[:Cn:]"
-                                    + "[:Default_Ignorable_Code_Point:]"
-                                    + "[:patternwhitespace:]"
-                                    + "[:Me:][:Mn:]]" // add non-spacing marks
-                            )
-                    .freeze();
-
     public static final Pattern NUMBER_SEPARATOR_PATTERN =
             Pattern.compile("//ldml/numbers/symbols.*/(decimal|group)");
 
@@ -728,6 +720,28 @@ public static void filterCoveredKeywords(TreeSet<String> sorted) {
         sorted.removeAll(toRemove);
     }
 
+    /**
+     * Given a sorted set like "BEAR | Bear | PANDA | Panda | panda", filter out any items that
+     * duplicate other items aside from case, leaving only, for example, "BEAR | PANDA". Of each
+     * group, the variant that sorts first in natural String order is the one kept.
+     *
+     * @param sorted the set from which items may be removed; modified in place
+     */
+    public static void filterKeywordsDifferingOnlyInCase(TreeSet<String> sorted) {
+        // Locale.ROOT keeps grouping independent of the JVM default locale (e.g. Turkish I/ı).
+        final java.util.Locale root = java.util.Locale.ROOT;
+        TreeMultimap<String, String> mapFromLower = TreeMultimap.create();
+        for (String item : sorted) {
+            mapFromLower.put(item.toLowerCase(root), item);
+        }
+        TreeSet<String> toRetain = new TreeSet<>();
+        for (String lower : mapFromLower.keySet()) {
+            // Every key in keySet() has at least one value, so next() cannot fail here.
+            toRetain.add(mapFromLower.get(lower).iterator().next());
+        }
+        sorted.retainAll(toRetain);
+    }
+
     private String displayUnicodeSet(String value) {
         return pp.format(
                 new UnicodeSet(value)); // will throw exception if bad format, eg missing [...]
@@ -1040,10 +1054,6 @@ public static String fixAdlamNasalization(String fromString) {
                 .replaceAll("$1" + ADLAM_NASALIZATION + "$2"); // replace quote with 𞥋
     }
 
-    static Pattern NEEDS_QUOTE1 = PatternCache.get("(\\s|$)([-\\}\\]\\&])()");
-    static Pattern NEEDS_QUOTE2 =
-            PatternCache.get("([^\\\\])([\\-\\{\\[\\&])(\\s)"); // ([^\\])([\\-\\{\\[])(\\s)
-
     public String getCleanedUnicodeSet(UnicodeSet exemplar, ExemplarType exemplarType) {
 
         if (rawFormatter == null) {

diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/CLDRModify.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/CLDRModify.java
@@ -2091,6 +2091,9 @@ public void handlePath(String xpath) {
                     TreeSet<String> sorted = new TreeSet<>(Collator.getInstance(ULocale.ROOT));
                     CLDRFile resolved;
 
+                    Set<String> fakeKeywordPaths = new TreeSet<>();
+                    Set<String> realKeywordPaths = new TreeSet<>();
+
                     @Override
                     public void handleStart() {
                         String localeID = cldrFileToFilter.getLocaleID();
@@ -2114,12 +2117,14 @@ public void handlePath(String xpath) {
                         XPathParts parts = XPathParts.getFrozenInstance(fullpath);
                         String type = parts.getAttributeValue(2, "type");
                         if (type == null) {
+                            realKeywordPaths.add(xpath);
                             return; // no TTS, so keywords, skip
                         }
-
                         XPathParts keywordParts = parts.cloneAsThawed().removeAttribute(2, "type");
-                        String keywordPath =
-                                CLDRFile.getDistinguishingXPath(keywordParts.toString(), null);
+                        String keywordPath = keywordParts.toString();
+                        fakeKeywordPaths.add(keywordPath);
+                        String distinguishingKeywordPath =
+                                CLDRFile.getDistinguishingXPath(keywordPath, null);
                         String rawKeywordValue = cldrFileToFilter.getStringValue(keywordPath);
 
                         // skip if keywords AND name are inherited
@@ -2140,7 +2145,7 @@ public void handlePath(String xpath) {
 
                         String name = resolved.getStringValue(xpath);
                         String keywordValue = resolved.getStringValue(keywordPath);
-                        String sourceLocaleId = resolved.getSourceLocaleID(keywordPath, null);
+                        String sourceLocaleId = resolved.getSourceLocaleID(distinguishingKeywordPath, null);
                         sorted.clear();
                         sorted.add(name);
 
@@ -2151,14 +2156,30 @@ public void handlePath(String xpath) {
                             sorted.addAll(items);
                         }
                         DisplayAndInputProcessor.filterCoveredKeywords(sorted);
-                        // TODO: Also filter items that are duplicates except for case
-                        // Reference: https://unicode-org.atlassian.net/browse/CLDR-16972
-                        // DisplayAndInputProcessor.filterKeywordsDifferingOnlyInCase(sorted);
+                        DisplayAndInputProcessor.filterKeywordsDifferingOnlyInCase(sorted);
                         String newKeywordValue = Joiner.on(" | ").join(sorted);
                         if (!newKeywordValue.equals(keywordValue)) {
                             replace(keywordPath, keywordPath, newKeywordValue);
                         }
                     }
+
+                    /** Debug aid: print the keyword paths found in only one of the fake/real sets. */
+                    @Override
+                    public void handleEnd() {
+                        if (fakeKeywordPaths.isEmpty() || realKeywordPaths.isEmpty()) {
+                            throw new RuntimeException("fake/real EMPTY loc: " + cldrFileToFilter.getLocaleID());
+                        }
+                        Set<String> onlyFake = new TreeSet<>(fakeKeywordPaths); // diff on copies so that
+                        Set<String> onlyReal = new TreeSet<>(realKeywordPaths); // neither diff skews the other
+                        onlyFake.removeAll(realKeywordPaths);
+                        onlyReal.removeAll(fakeKeywordPaths);
+                        for (String p : onlyFake) {
+                            System.out.println("ONLY fake: " + p + " loc: " + cldrFileToFilter.getLocaleID());
+                        }
+                        for (String p : onlyReal) {
+                            System.out.println("ONLY real: " + p + " loc: " + cldrFileToFilter.getLocaleID());
+                        }
+                    }
                 });
 
         fixList.add(

diff --git a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestDisplayAndInputProcessor.java b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestDisplayAndInputProcessor.java
@@ -4,6 +4,7 @@
 import com.ibm.icu.lang.CharSequences;
 import com.ibm.icu.text.UnicodeSet;
 import com.ibm.icu.text.UnicodeSetIterator;
+import java.util.Arrays;
 import java.util.Set;
 import java.util.TreeSet;
 import org.unicode.cldr.test.DisplayAndInputProcessor;
@@ -817,4 +818,49 @@ public void TestFilterCoveredKeywords() {
             }
         }
     }
+
+    /** Pairs an input keyword array with the array expected after case-duplicate filtering. */
+    private class KeywordCaseTestData {
+        String[] array, expectedArray;
+        KeywordCaseTestData(String[] array, String[] expectedArray) {
+            this.array = array;
+            this.expectedArray = expectedArray;
+        }
+
+        /** Filters a TreeSet built from {@code array}; on mismatch logs via errln, returns false. */
+        boolean filtersAsExpected() {
+            TreeSet<String> actual = new TreeSet<>(Arrays.asList(array));
+            DisplayAndInputProcessor.filterKeywordsDifferingOnlyInCase(actual);
+            TreeSet<String> wanted = new TreeSet<>(Arrays.asList(expectedArray));
+            if (!actual.equals(wanted)) {
+                errln("Resulting set " + actual + " differs from expected set " + wanted);
+                return false;
+            }
+            return true;
+        }
+    }
+
+    /**
+     * Verifies DisplayAndInputProcessor.filterKeywordsDifferingOnlyInCase on three cases: pure
+     * upper/title/lower duplicates, a multi-word keyword list, and a list that must not change.
+     */
+    public void TestFilterKeywordsDifferingOnlyInCase() {
+        String[] noCaseDuplicates = {"Aa", "Bb", "Cc", "Dd", "行"}; // should not change
+        // Each entry is {input, expected}; the reported number is the 1-based entry position.
+        String[][][] cases = {
+            {{"BEAR", "Bear", "PANDA", "Panda", "panda"}, {"BEAR", "PANDA"}},
+            {
+                {"gebou", "Japannees", "japanse poskantoor", "Japanse poskantoor", "pos"},
+                {"gebou", "Japannees", "Japanse poskantoor", "pos"}
+            },
+            {noCaseDuplicates, noCaseDuplicates}
+        };
+        for (int i = 0; i < cases.length; i++) {
+            KeywordCaseTestData dat = new KeywordCaseTestData(cases[i][0], cases[i][1]);
+            if (!dat.filtersAsExpected()) {
+                // filtersAsExpected() has already logged the two sets; this adds the case number
+                errln("Resulting set differs from expected set " + (i + 1));
+            }
+        }
+    }
 }