Skip to content

Commit

Permalink
CLDR-17063 CLDRModify -fQ debugging, real/fake keyword paths
Browse files Browse the repository at this point in the history
-This illustrates the cause of the draft=unconfirmed bug

-The old, fake keyword path is derived from tts path by removeAttribute

-The new, real keyword path is obtained from the CLDRFile

-In general, they are not the same; a common difference is draft=unconfirmed
  • Loading branch information
btangmu committed Sep 18, 2023
1 parent 61b74a3 commit 7f3d2e9
Show file tree
Hide file tree
Showing 3 changed files with 97 additions and 20 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.TreeMultimap;
import com.google.myanmartools.ZawgyiDetector;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.Collator;
Expand Down Expand Up @@ -71,15 +72,6 @@ public class DisplayAndInputProcessor {
/** Frozen set of characters whose Bidi_Class is Arabic_Letter or Right_To_Left. */
public static final UnicodeSet RTL =
new UnicodeSet("[[:Bidi_Class=Arabic_Letter:][:Bidi_Class=Right_To_Left:]]").freeze();

/**
 * Frozen set combining unassigned code points (Cn), default-ignorable code points, pattern
 * white space, and enclosing (Me) and non-spacing (Mn) marks.
 */
public static final UnicodeSet TO_QUOTE =
new UnicodeSet(
"[[:Cn:]"
+ "[:Default_Ignorable_Code_Point:]"
+ "[:patternwhitespace:]"
+ "[:Me:][:Mn:]]" // add non-spacing marks
)
.freeze();

/**
 * Regex for number-symbol paths: the literal prefix {@code //ldml/numbers/symbols}, any
 * characters, then a {@code /decimal} or {@code /group} segment (captured as group 1).
 */
public static final Pattern NUMBER_SEPARATOR_PATTERN =
Pattern.compile("//ldml/numbers/symbols.*/(decimal|group)");

Expand Down Expand Up @@ -728,6 +720,28 @@ public static void filterCoveredKeywords(TreeSet<String> sorted) {
sorted.removeAll(toRemove);
}

/**
 * Given a sorted set like "BEAR | Bear | PANDA | Panda | panda", filter out any items that
 * duplicate other items aside from case, leaving only, for example, "BEAR | PANDA".
 *
 * <p>Among items that differ only in case, the one that is first in natural {@link String}
 * order is kept (so upper-case ASCII variants win over lower-case ones), regardless of the
 * comparator used by {@code sorted}. Case is folded with {@code Locale.ROOT} so the result does
 * not depend on the JVM default locale (for example, under a Turkish default locale "TITLE"
 * and "title" would otherwise not be recognized as case variants).
 *
 * @param sorted the set from which items may be removed; modified in place
 */
public static void filterKeywordsDifferingOnlyInCase(TreeSet<String> sorted) {
    // Re-sort by natural String order. Items are added one at a time on purpose:
    // new TreeSet<>(sorted) would inherit the caller's comparator (e.g., a Collator).
    TreeSet<String> naturalOrder = new TreeSet<>();
    for (String item : sorted) {
        naturalOrder.add(item);
    }
    // Walking in natural order, the first item seen for each lower-cased form is the
    // representative to keep; all later case variants are dropped.
    TreeSet<String> seenLower = new TreeSet<>();
    TreeSet<String> toRetain = new TreeSet<>();
    for (String item : naturalOrder) {
        if (seenLower.add(item.toLowerCase(java.util.Locale.ROOT))) {
            toRetain.add(item);
        }
    }
    sorted.retainAll(toRetain);
}

private String displayUnicodeSet(String value) {
return pp.format(
new UnicodeSet(value)); // will throw exception if bad format, eg missing [...]
Expand Down Expand Up @@ -1040,10 +1054,6 @@ public static String fixAdlamNasalization(String fromString) {
.replaceAll("$1" + ADLAM_NASALIZATION + "$2"); // replace quote with 𞥋
}

// Regex: (group 1) whitespace or an end anchor, (group 2) one of - } ] &, (group 3) empty.
static Pattern NEEDS_QUOTE1 = PatternCache.get("(\\s|$)([-\\}\\]\\&])()");
// Regex: (group 1) any character other than a backslash, (group 2) one of - { [ &,
// (group 3) a whitespace character.
static Pattern NEEDS_QUOTE2 =
PatternCache.get("([^\\\\])([\\-\\{\\[\\&])(\\s)"); // ([^\\])([\\-\\{\\[])(\\s)

public String getCleanedUnicodeSet(UnicodeSet exemplar, ExemplarType exemplarType) {

if (rawFormatter == null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2091,6 +2091,9 @@ public void handlePath(String xpath) {
TreeSet<String> sorted = new TreeSet<>(Collator.getInstance(ULocale.ROOT));
CLDRFile resolved;

// Debugging aid: "fake" keyword paths, built in handlePath by removing the "type"
// attribute from a path that has one. Compared against realKeywordPaths in handleEnd.
Set<String> fakeKeywordPaths = new TreeSet<>();
// Debugging aid: "real" keyword paths, i.e. the paths passed to handlePath that have no
// "type" attribute.
Set<String> realKeywordPaths = new TreeSet<>();

@Override
public void handleStart() {
String localeID = cldrFileToFilter.getLocaleID();
Expand All @@ -2114,12 +2117,14 @@ public void handlePath(String xpath) {
XPathParts parts = XPathParts.getFrozenInstance(fullpath);
String type = parts.getAttributeValue(2, "type");
if (type == null) {
realKeywordPaths.add(xpath);
return; // no TTS, so keywords, skip
}

XPathParts keywordParts = parts.cloneAsThawed().removeAttribute(2, "type");
String keywordPath =
CLDRFile.getDistinguishingXPath(keywordParts.toString(), null);
String keywordPath = keywordParts.toString();
fakeKeywordPaths.add(keywordPath);
String distinguishingKeywordPath =
CLDRFile.getDistinguishingXPath(keywordPath, null);
String rawKeywordValue = cldrFileToFilter.getStringValue(keywordPath);

// skip if keywords AND name are inherited
Expand All @@ -2140,7 +2145,7 @@ public void handlePath(String xpath) {

String name = resolved.getStringValue(xpath);
String keywordValue = resolved.getStringValue(keywordPath);
String sourceLocaleId = resolved.getSourceLocaleID(keywordPath, null);
String sourceLocaleId = resolved.getSourceLocaleID(distinguishingKeywordPath, null);
sorted.clear();
sorted.add(name);

Expand All @@ -2151,14 +2156,30 @@ public void handlePath(String xpath) {
sorted.addAll(items);
}
DisplayAndInputProcessor.filterCoveredKeywords(sorted);
// TODO: Also filter items that are duplicates except for case
// Reference: https://unicode-org.atlassian.net/browse/CLDR-16972
// DisplayAndInputProcessor.filterKeywordsDifferingOnlyInCase(sorted);
DisplayAndInputProcessor.filterKeywordsDifferingOnlyInCase(sorted);
String newKeywordValue = Joiner.on(" | ").join(sorted);
if (!newKeywordValue.equals(keywordValue)) {
replace(keywordPath, keywordPath, newKeywordValue);
}
}

/**
 * Debugging check run after all paths were handled: reports keyword paths that were only
 * derived from a typed path ("fake") and keyword paths that were only seen directly
 * ("real").
 *
 * <p>Both differences are computed on copies. Removing the common paths from
 * {@code fakeKeywordPaths} first and then calling
 * {@code realKeywordPaths.removeAll(fakeKeywordPaths)} would remove nothing from the real
 * set, so every real path would be reported as "ONLY real". The two field sets are left
 * unmodified here.
 *
 * @throws RuntimeException if either set of paths is empty
 */
@Override
public void handleEnd() {
    final String localeID = cldrFileToFilter.getLocaleID();
    if (fakeKeywordPaths.isEmpty() || realKeywordPaths.isEmpty()) {
        throw new RuntimeException("fake/real EMPTY loc: " + localeID);
    }
    if (!fakeKeywordPaths.equals(realKeywordPaths)) {
        Set<String> onlyFake = new TreeSet<>(fakeKeywordPaths);
        onlyFake.removeAll(realKeywordPaths);
        Set<String> onlyReal = new TreeSet<>(realKeywordPaths);
        onlyReal.removeAll(fakeKeywordPaths);
        for (String p : onlyFake) {
            System.out.println("ONLY fake: " + p + " loc: " + localeID);
        }
        for (String p : onlyReal) {
            System.out.println("ONLY real: " + p + " loc: " + localeID);
        }
        // Deliberately not fatal while debugging:
        // throw new RuntimeException("fake/real diff loc: " + localeID);
    }
}
});

fixList.add(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import com.ibm.icu.lang.CharSequences;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
import java.util.Arrays;
import java.util.Set;
import java.util.TreeSet;
import org.unicode.cldr.test.DisplayAndInputProcessor;
Expand Down Expand Up @@ -817,4 +818,49 @@ public void TestFilterCoveredKeywords() {
}
}
}

/**
 * One test case for filterKeywordsDifferingOnlyInCase: the input keywords and the keywords
 * expected to remain after filtering.
 */
private class KeywordCaseTestData {
    String[] array;
    String[] expectedArray;

    KeywordCaseTestData(String[] array, String[] expectedArray) {
        this.array = array;
        this.expectedArray = expectedArray;
    }

    /**
     * Filters a set built from the input array and compares it with the expected set.
     *
     * @return true if the filtered set equals the expected set; otherwise reports an error
     *     through errln and returns false
     */
    boolean filtersAsExpected() {
        TreeSet<String> expected = new TreeSet<>(Arrays.asList(expectedArray));
        TreeSet<String> actual = new TreeSet<>(Arrays.asList(array));
        DisplayAndInputProcessor.filterKeywordsDifferingOnlyInCase(actual);
        boolean matches = actual.equals(expected);
        if (!matches) {
            errln("Resulting set " + actual + " differs from expected set " + expected);
        }
        return matches;
    }
}

/**
 * Checks filterKeywordsDifferingOnlyInCase against a small table of inputs and expected
 * results; each failing case is reported with its 1-based position in the table.
 */
public void TestFilterKeywordsDifferingOnlyInCase() {
    String[] unchanged = {"Aa", "Bb", "Cc", "Dd", "行"}; // should not change
    String[][][] cases = {
        {
            {"BEAR", "Bear", "PANDA", "Panda", "panda"},
            {"BEAR", "PANDA"},
        },
        {
            {"gebou", "Japannees", "japanse poskantoor", "Japanse poskantoor", "pos"},
            {"gebou", "Japannees", "Japanse poskantoor", "pos"},
        },
        {unchanged, unchanged},
    };
    for (int i = 0; i < cases.length; i++) {
        KeywordCaseTestData dat = new KeywordCaseTestData(cases[i][0], cases[i][1]);
        if (!dat.filtersAsExpected()) {
            errln("Resulting set differs from expected set " + (i + 1));
        }
    }
}
}

0 comments on commit 7f3d2e9

Please sign in to comment.