Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CLDR-17063 CLDRModify -fQ debugging, real/fake keyword paths #3272

Closed
wants to merge 2 commits into from
Closed
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
CLDR-17063 CLDRModify -fQ debugging, real/fake keyword paths
-This illustrates the cause of the draft=unconfirmed bug

-The old, fake keyword path is derived from tts path by removeAttribute

-The new, real keyword path is obtained from the CLDRFile

-In general, they are not the same; the most common difference is draft=unconfirmed
  • Loading branch information
btangmu committed Sep 18, 2023
commit 7f3d2e9effc340ac675d74c0193ea599d7c66fa0
Original file line number Diff line number Diff line change
@@ -5,6 +5,7 @@

import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.TreeMultimap;
import com.google.myanmartools.ZawgyiDetector;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.Collator;
@@ -71,15 +72,6 @@ public class DisplayAndInputProcessor {
/** Frozen set of characters whose Bidi_Class is Arabic_Letter or Right_To_Left. */
public static final UnicodeSet RTL =
new UnicodeSet("[[:Bidi_Class=Arabic_Letter:][:Bidi_Class=Right_To_Left:]]").freeze();

/**
 * Frozen set of unassigned code points (Cn), default-ignorable code points, pattern white
 * space, and enclosing/non-spacing marks (Me, Mn).
 *
 * <p>NOTE(review): the name suggests these characters get quoted or escaped for display;
 * the usage is not visible here, so confirm against callers.
 */
public static final UnicodeSet TO_QUOTE =
new UnicodeSet(
"[[:Cn:]"
+ "[:Default_Ignorable_Code_Point:]"
+ "[:patternwhitespace:]"
+ "[:Me:][:Mn:]]" // add non-spacing marks
)
.freeze();

/**
 * Matches text containing {@code //ldml/numbers/symbols}, any characters, and then
 * {@code /decimal} or {@code /group}.
 */
public static final Pattern NUMBER_SEPARATOR_PATTERN =
Pattern.compile("//ldml/numbers/symbols.*/(decimal|group)");

@@ -728,6 +720,28 @@ public static void filterCoveredKeywords(TreeSet<String> sorted) {
sorted.removeAll(toRemove);
}

/**
 * Given a sorted list like "BEAR | Bear | PANDA | Panda | panda", filter out any items that
 * duplicate other items aside from case, leaving only, for example, "BEAR | PANDA".
 *
 * <p>Items are grouped by their lower-cased form. The lower-casing uses {@code Locale.ROOT} so
 * the grouping does not depend on the JVM's default locale; under a Turkish default locale,
 * "TITLE" and "title" would otherwise lower-case to different strings and both be kept.
 *
 * <p>From each group, only the variant that comes first in natural {@link String} order is
 * retained, regardless of the comparator used by {@code sorted}.
 *
 * @param sorted the set from which items may be removed; it is modified in place
 */
public static void filterKeywordsDifferingOnlyInCase(TreeSet<String> sorted) {
    // Maps each lower-cased form to the variant that is smallest in natural String order.
    java.util.Map<String, String> firstVariantByLower = new java.util.HashMap<>();
    for (String item : sorted) {
        String lower = item.toLowerCase(java.util.Locale.ROOT);
        String kept = firstVariantByLower.get(lower);
        if (kept == null || item.compareTo(kept) < 0) {
            firstVariantByLower.put(lower, item);
        }
    }
    // A natural-order TreeSet makes membership depend on exact string equality only.
    sorted.retainAll(new TreeSet<>(firstVariantByLower.values()));
}

private String displayUnicodeSet(String value) {
return pp.format(
new UnicodeSet(value)); // will throw exception if bad format, eg missing [...]
@@ -1040,10 +1054,6 @@ public static String fixAdlamNasalization(String fromString) {
.replaceAll("$1" + ADLAM_NASALIZATION + "$2"); // replace quote with 𞥋
}

// Matches white space (or end of input) followed by one of: - } ] &
// The third capture group is always empty.
static Pattern NEEDS_QUOTE1 = PatternCache.get("(\\s|$)([-\\}\\]\\&])()");
// Matches a character other than backslash, followed by one of: - { [ &
// and then a white-space character.
static Pattern NEEDS_QUOTE2 =
PatternCache.get("([^\\\\])([\\-\\{\\[\\&])(\\s)"); // regex: ([^\\])([\-\{\[\&])(\s)

public String getCleanedUnicodeSet(UnicodeSet exemplar, ExemplarType exemplarType) {

if (rawFormatter == null) {
Original file line number Diff line number Diff line change
@@ -2091,6 +2091,9 @@ public void handlePath(String xpath) {
// Working set of name/keyword items, ordered by the root-locale collator.
TreeSet<String> sorted = new TreeSet<>(Collator.getInstance(ULocale.ROOT));
// NOTE(review): presumably the resolved view of the file being filtered; its assignment is
// not visible here, so confirm where it is set.
CLDRFile resolved;

// Debugging aid: keyword paths synthesized by removing the "type" attribute from a path.
Set<String> fakeKeywordPaths = new TreeSet<>();
// Debugging aid: paths passed to handlePath that have no "type" attribute.
Set<String> realKeywordPaths = new TreeSet<>();

@Override
public void handleStart() {
String localeID = cldrFileToFilter.getLocaleID();
@@ -2114,12 +2117,14 @@ public void handlePath(String xpath) {
XPathParts parts = XPathParts.getFrozenInstance(fullpath);
String type = parts.getAttributeValue(2, "type");
if (type == null) {
realKeywordPaths.add(xpath);
return; // no TTS, so keywords, skip
}

XPathParts keywordParts = parts.cloneAsThawed().removeAttribute(2, "type");
String keywordPath =
CLDRFile.getDistinguishingXPath(keywordParts.toString(), null);
String keywordPath = keywordParts.toString();
fakeKeywordPaths.add(keywordPath);
String distinguishingKeywordPath =
CLDRFile.getDistinguishingXPath(keywordPath, null);
String rawKeywordValue = cldrFileToFilter.getStringValue(keywordPath);

// skip if keywords AND name are inherited
@@ -2140,7 +2145,7 @@ public void handlePath(String xpath) {

String name = resolved.getStringValue(xpath);
String keywordValue = resolved.getStringValue(keywordPath);
String sourceLocaleId = resolved.getSourceLocaleID(keywordPath, null);
String sourceLocaleId = resolved.getSourceLocaleID(distinguishingKeywordPath, null);
sorted.clear();
sorted.add(name);

@@ -2151,14 +2156,30 @@ public void handlePath(String xpath) {
sorted.addAll(items);
}
DisplayAndInputProcessor.filterCoveredKeywords(sorted);
// TODO: Also filter items that are duplicates except for case
// Reference: https://unicode-org.atlassian.net/browse/CLDR-16972
// DisplayAndInputProcessor.filterKeywordsDifferingOnlyInCase(sorted);
DisplayAndInputProcessor.filterKeywordsDifferingOnlyInCase(sorted);
String newKeywordValue = Joiner.on(" | ").join(sorted);
if (!newKeywordValue.equals(keywordValue)) {
replace(keywordPath, keywordPath, newKeywordValue);
}
}

/**
 * Debugging aid: compares the synthesized ("fake") keyword paths against the keyword paths
 * actually encountered ("real") and prints every path that occurs in only one of the two sets.
 *
 * <p>Both differences are computed on copies. Removing the shared paths from one set in place
 * and then subtracting that reduced set from the other would subtract nothing, so every real
 * path would be reported as "ONLY real".
 *
 * <p>NOTE(review): whether the two sets are reset between locales is not visible here; confirm
 * that state from one file cannot leak into the report for the next.
 *
 * @throws RuntimeException if either set is empty
 */
@Override
public void handleEnd() {
    final String localeId = cldrFileToFilter.getLocaleID();
    if (fakeKeywordPaths.isEmpty() || realKeywordPaths.isEmpty()) {
        throw new RuntimeException("fake/real EMPTY loc: " + localeId);
    }
    if (fakeKeywordPaths.equals(realKeywordPaths)) {
        return; // nothing to report
    }
    Set<String> onlyFake = new TreeSet<>(fakeKeywordPaths);
    onlyFake.removeAll(realKeywordPaths);
    Set<String> onlyReal = new TreeSet<>(realKeywordPaths);
    onlyReal.removeAll(fakeKeywordPaths);
    for (String p : onlyFake) {
        System.out.println("ONLY fake: " + p + " loc: " + localeId);
    }
    for (String p : onlyReal) {
        System.out.println("ONLY real: " + p + " loc: " + localeId);
    }
    // throw new RuntimeException("fake/real diff loc: " + cldrFileToFilter.getLocaleID());
}
});

fixList.add(
Original file line number Diff line number Diff line change
@@ -4,6 +4,7 @@
import com.ibm.icu.lang.CharSequences;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
import java.util.Arrays;
import java.util.Set;
import java.util.TreeSet;
import org.unicode.cldr.test.DisplayAndInputProcessor;
@@ -817,4 +818,49 @@ public void TestFilterCoveredKeywords() {
}
}
}

/**
 * One test case for DisplayAndInputProcessor.filterKeywordsDifferingOnlyInCase: an input list
 * of keywords and the keywords expected to remain after filtering.
 */
private class KeywordCaseTestData {
    String[] array;
    String[] expectedArray;

    KeywordCaseTestData(String[] array, String[] expectedArray) {
        this.array = array;
        this.expectedArray = expectedArray;
    }

    /**
     * Runs the filter on a set built from the input and compares it with the expected set.
     *
     * @return true if the filtered set equals the expected set; otherwise reports the
     *     difference through errln and returns false
     */
    boolean filtersAsExpected() {
        final TreeSet<String> actual = new TreeSet<>(Arrays.asList(array));
        DisplayAndInputProcessor.filterKeywordsDifferingOnlyInCase(actual);
        final TreeSet<String> wanted = new TreeSet<>(Arrays.asList(expectedArray));
        final boolean matches = actual.equals(wanted);
        if (!matches) {
            errln("Resulting set " + actual + " differs from expected set " + wanted);
        }
        return matches;
    }
}

/**
 * Checks filterKeywordsDifferingOnlyInCase against three cases: plain case duplicates, a
 * multi-word duplicate, and a list without case duplicates that must stay unchanged.
 */
public void TestFilterKeywordsDifferingOnlyInCase() {
    final String[] noDuplicates = {"Aa", "Bb", "Cc", "Dd", "行"}; // should not change
    // Each entry is {input, expected}; failures are numbered from 1 in table order.
    final String[][][] cases = {
        {
            {"BEAR", "Bear", "PANDA", "Panda", "panda"},
            {"BEAR", "PANDA"},
        },
        {
            {"gebou", "Japannees", "japanse poskantoor", "Japanse poskantoor", "pos"},
            {"gebou", "Japannees", "Japanse poskantoor", "pos"},
        },
        {noDuplicates, noDuplicates},
    };
    for (int i = 0; i < cases.length; i++) {
        final KeywordCaseTestData testCase = new KeywordCaseTestData(cases[i][0], cases[i][1]);
        if (!testCase.filtersAsExpected()) {
            errln("Resulting set differs from expected set " + (i + 1));
        }
    }
}
}