From 7f3d2e9effc340ac675d74c0193ea599d7c66fa0 Mon Sep 17 00:00:00 2001 From: btangmu Date: Mon, 18 Sep 2023 11:12:17 -0400 Subject: [PATCH] CLDR-17063 CLDRModify -fQ debugging, real/fake keyword paths -This illustrates cause of draft=unconfirmed bug -The old, fake keyword path is derived from tts path by removeAttribute -The new, real keyword path gotten from the CLDRFile -In general, they are not the same, common difference is draft unconfirmed --- .../cldr/test/DisplayAndInputProcessor.java | 36 +++++++++------ .../org/unicode/cldr/tool/CLDRModify.java | 35 +++++++++++--- .../TestDisplayAndInputProcessor.java | 46 +++++++++++++++++++ 3 files changed, 97 insertions(+), 20 deletions(-) diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/test/DisplayAndInputProcessor.java b/tools/cldr-code/src/main/java/org/unicode/cldr/test/DisplayAndInputProcessor.java index 7def78fb627..3145f05cd66 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/test/DisplayAndInputProcessor.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/test/DisplayAndInputProcessor.java @@ -5,6 +5,7 @@ import com.google.common.base.Joiner; import com.google.common.base.Splitter; +import com.google.common.collect.TreeMultimap; import com.google.myanmartools.ZawgyiDetector; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.text.Collator; @@ -71,15 +72,6 @@ public class DisplayAndInputProcessor { public static final UnicodeSet RTL = new UnicodeSet("[[:Bidi_Class=Arabic_Letter:][:Bidi_Class=Right_To_Left:]]").freeze(); - public static final UnicodeSet TO_QUOTE = - new UnicodeSet( - "[[:Cn:]" - + "[:Default_Ignorable_Code_Point:]" - + "[:patternwhitespace:]" - + "[:Me:][:Mn:]]" // add non-spacing marks - ) - .freeze(); - public static final Pattern NUMBER_SEPARATOR_PATTERN = Pattern.compile("//ldml/numbers/symbols.*/(decimal|group)"); @@ -728,6 +720,28 @@ public static void filterCoveredKeywords(TreeSet sorted) { sorted.removeAll(toRemove); } + /** + * Given a sorted list like "BEAR | Bear | PANDA | Panda | panda",filter out any items that + * duplicate other items aside from case, leaving only, for example, "BEAR | PANDA" + * + * @param sorted the set from which items may be removed + */ + public static void filterKeywordsDifferingOnlyInCase(TreeSet sorted) { + TreeMultimap mapFromLower = TreeMultimap.create(); + for (String item : sorted) { + mapFromLower.put(item.toLowerCase(), item); + } + TreeSet toRetain = new TreeSet<>(); + for (String lower : mapFromLower.keySet()) { + Set variants = mapFromLower.get(lower); + for (String var : variants) { + toRetain.add(var); + break; + } + } + sorted.retainAll(toRetain); + } + private String displayUnicodeSet(String value) { return pp.format( new UnicodeSet(value)); // will throw exception if bad format, eg missing [...] @@ -1040,10 +1054,6 @@ public static String fixAdlamNasalization(String fromString) { .replaceAll("$1" + ADLAM_NASALIZATION + "$2"); // replace quote with 𞥋 } - static Pattern NEEDS_QUOTE1 = PatternCache.get("(\\s|$)([-\\}\\]\\&])()"); - static Pattern NEEDS_QUOTE2 = - PatternCache.get("([^\\\\])([\\-\\{\\[\\&])(\\s)"); // ([^\\])([\\-\\{\\[])(\\s) - public String getCleanedUnicodeSet(UnicodeSet exemplar, ExemplarType exemplarType) { if (rawFormatter == null) { diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/CLDRModify.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/CLDRModify.java index b096b13f04c..7b08e763b39 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/CLDRModify.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/CLDRModify.java @@ -2091,6 +2091,9 @@ public void handlePath(String xpath) { TreeSet sorted = new TreeSet<>(Collator.getInstance(ULocale.ROOT)); CLDRFile resolved; + Set fakeKeywordPaths = new TreeSet<>(); + Set realKeywordPaths = new TreeSet<>(); + @Override public void handleStart() { String localeID = cldrFileToFilter.getLocaleID(); @@ -2114,12 +2117,14 @@ public void handlePath(String xpath) { XPathParts parts = XPathParts.getFrozenInstance(fullpath); String type = parts.getAttributeValue(2, "type"); if (type == null) { + realKeywordPaths.add(xpath); return; // no TTS, so keywords, skip } - XPathParts keywordParts = parts.cloneAsThawed().removeAttribute(2, "type"); - String keywordPath = - CLDRFile.getDistinguishingXPath(keywordParts.toString(), null); + String keywordPath = keywordParts.toString(); + fakeKeywordPaths.add(keywordPath); + String distinguishingKeywordPath = + CLDRFile.getDistinguishingXPath(keywordPath, null); String rawKeywordValue = cldrFileToFilter.getStringValue(keywordPath); // skip if keywords AND name are inherited @@ -2140,7 +2145,7 @@ public void handlePath(String xpath) { String name = resolved.getStringValue(xpath); String keywordValue = resolved.getStringValue(keywordPath); - String sourceLocaleId = resolved.getSourceLocaleID(keywordPath, null); + String sourceLocaleId = resolved.getSourceLocaleID(distinguishingKeywordPath, null); sorted.clear(); sorted.add(name); @@ -2151,14 +2156,30 @@ public void handlePath(String xpath) { sorted.addAll(items); } DisplayAndInputProcessor.filterCoveredKeywords(sorted); - // TODO: Also filter items that are duplicates except for case - // Reference: https://unicode-org.atlassian.net/browse/CLDR-16972 - // DisplayAndInputProcessor.filterKeywordsDifferingOnlyInCase(sorted); + DisplayAndInputProcessor.filterKeywordsDifferingOnlyInCase(sorted); String newKeywordValue = Joiner.on(" | ").join(sorted); if (!newKeywordValue.equals(keywordValue)) { replace(keywordPath, keywordPath, newKeywordValue); } } + + @Override + public void handleEnd() { + if (fakeKeywordPaths.isEmpty() || realKeywordPaths.isEmpty()) { + throw new RuntimeException("fake/real EMPTY loc: " + cldrFileToFilter.getLocaleID()); + } + if (!fakeKeywordPaths.equals(realKeywordPaths)) { + fakeKeywordPaths.removeAll(realKeywordPaths); + realKeywordPaths.removeAll(fakeKeywordPaths); + for (String p : fakeKeywordPaths) { + System.out.println("ONLY fake: " + p + " loc: " + cldrFileToFilter.getLocaleID()); + } + for (String p : realKeywordPaths) { + System.out.println("ONLY real: " + p + " loc: " + cldrFileToFilter.getLocaleID()); + } + // throw new RuntimeException("fake/real diff loc: " + cldrFileToFilter.getLocaleID()); + } + } }); fixList.add( diff --git a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestDisplayAndInputProcessor.java b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestDisplayAndInputProcessor.java index 8d1359e231b..625da1d47a3 100644 --- a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestDisplayAndInputProcessor.java +++ b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestDisplayAndInputProcessor.java @@ -4,6 +4,7 @@ import com.ibm.icu.lang.CharSequences; import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.text.UnicodeSetIterator; +import java.util.Arrays; import java.util.Set; import java.util.TreeSet; import org.unicode.cldr.test.DisplayAndInputProcessor; @@ -817,4 +818,49 @@ public void TestFilterCoveredKeywords() { } } } + + private class KeywordCaseTestData { + String[] array, expectedArray; + + KeywordCaseTestData(String[] array, String[] expectedArray) { + this.array = array; + this.expectedArray = expectedArray; + } + + boolean filtersAsExpected() { + TreeSet set = new TreeSet<>(Arrays.asList(array)); + TreeSet expectedSet = new TreeSet<>(Arrays.asList(expectedArray)); + DisplayAndInputProcessor.filterKeywordsDifferingOnlyInCase(set); + if (set.equals(expectedSet)) { + return true; + } else { + errln("Resulting set " + set + " differs from expected set " + expectedSet); + return false; + } + } + } + + public void TestFilterKeywordsDifferingOnlyInCase() { + String[] array = new String[] {"BEAR", "Bear", "PANDA", "Panda", "panda"}; + String[] expectedArray = new String[] {"BEAR", "PANDA"}; + KeywordCaseTestData dat = new KeywordCaseTestData(array, expectedArray); + if (!dat.filtersAsExpected()) { + errln("Resulting set differs from expected set 1"); + } + array = + new String[] { + "gebou", "Japannees", "japanse poskantoor", "Japanse poskantoor", "pos" + }; + expectedArray = new String[] {"gebou", "Japannees", "Japanse poskantoor", "pos"}; + dat = new KeywordCaseTestData(array, expectedArray); + if (!dat.filtersAsExpected()) { + errln("Resulting set differs from expected set 2"); + } + array = new String[] {"Aa", "Bb", "Cc", "Dd", "行"}; // should not change + expectedArray = array; + dat = new KeywordCaseTestData(array, expectedArray); + if (!dat.filtersAsExpected()) { + errln("Resulting set differs from expected set 3"); + } + } }