Skip to content

Commit

Permalink
CLDR-17153 Investigate discrepancy between growth and coverage charts
Browse files Browse the repository at this point in the history
  • Loading branch information
macchiati committed Oct 5, 2023
1 parent bb2bdcd commit f20bee6
Show file tree
Hide file tree
Showing 3 changed files with 248 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,14 @@
import org.unicode.cldr.util.Counter2;
import org.unicode.cldr.util.Level;
import org.unicode.cldr.util.LocaleNames;
import org.unicode.cldr.util.Organization;
import org.unicode.cldr.util.PathHeader;
import org.unicode.cldr.util.PathHeader.Factory;
import org.unicode.cldr.util.PatternCache;
import org.unicode.cldr.util.RegexLookup;
import org.unicode.cldr.util.RegexLookup.LookupType;
import org.unicode.cldr.util.SimpleFactory;
import org.unicode.cldr.util.StandardCodes;
import org.unicode.cldr.util.SupplementalDataInfo;
import org.unicode.cldr.util.VettingViewer;
import org.unicode.cldr.util.VettingViewer.MissingStatus;
Expand All @@ -51,6 +53,8 @@ public class ChartLocaleGrowth {
private static CLDRConfig testInfo = ToolConfig.getToolInstance();
private static final SupplementalDataInfo SUPPLEMENTAL_DATA_INFO =
testInfo.getSupplementalDataInfo();
static final Set<String> CldrModernLocales =
StandardCodes.make().getLocaleCoverageLocales(Organization.cldr, Set.of(Level.MODERN));

private static org.unicode.cldr.util.Factory factory =
testInfo.getCommonAndSeedAndMainAndAnnotationsFactory();
Expand Down Expand Up @@ -79,7 +83,11 @@ public class ChartLocaleGrowth {
static final Options myOptions = new Options();

private enum MyOptions {
filter(".+", ".*", "Filter the information based on id, using a regex argument."),
filter(".+", ".*", "Filter the information based on locale, using a regex argument."),
Versions(
".+",
".*",
"Filter the information based on cldr version, using a regex argument."),
// draftStatus(".+", "unconfirmed", "Filter the information to a minimum draft status."),
;

Expand All @@ -95,25 +103,35 @@ private enum MyOptions {
public static void main(String[] args) throws IOException {
myOptions.parse(MyOptions.filter, args, true);

Matcher matcher = PatternCache.get(MyOptions.filter.option.getValue()).matcher("");
Matcher localeMatcher = PatternCache.get(MyOptions.filter.option.getValue()).matcher("");
Matcher versionMatcher = PatternCache.get(MyOptions.Versions.option.getValue()).matcher("");

try (PrintWriter out =
FileUtilities.openUTF8Writer(
CLDRPaths.CHART_DIRECTORY + "tsv/", "locale-growth.tsv")) {
doGrowth(matcher, out);
doGrowth(localeMatcher, versionMatcher, out);
return;
}
}

private static void doGrowth(Matcher matcher, PrintWriter out) {
private static void doGrowth(Matcher localeMatcher, Matcher versionMatcher, PrintWriter out) {
TreeMap<String, List<Double>> growthData =
new TreeMap<>(Ordering.natural().reverse()); // sort by version, descending
Map<String, FoundAndTotal> latestData = null;
ReleaseInfo last = versionToYear.get(0);
for (ReleaseInfo versionNormalizedVersionAndYear : versionToYear) {
if (versionMatcher != null
&& !versionMatcher
.reset(versionNormalizedVersionAndYear.version.getVersionString(1, 2))
.matches()) {
continue;
}
VersionInfo version = versionNormalizedVersionAndYear.version;
int year = versionNormalizedVersionAndYear.year;
String dir = ToolConstants.getBaseDirectory(version.getVersionString(2, 3));
Map<String, FoundAndTotal> currentData = addGrowth(factory, dir, matcher, false);
boolean showMissing = last == versionNormalizedVersionAndYear;
Map<String, FoundAndTotal> currentData =
addGrowth(factory, dir, localeMatcher, showMissing);
long found = 0;
long total = 0;
for (Entry<String, FoundAndTotal> entry : currentData.entrySet()) {
Expand Down Expand Up @@ -258,7 +276,7 @@ public String toString() {
private static Map<String, FoundAndTotal> addGrowth(
org.unicode.cldr.util.Factory latestFactory,
String dir,
Matcher matcher,
Matcher localeMatcher,
boolean showMissing) {
final File mainDir = new File(dir + "/common/main/");
final File annotationDir = new File(dir + "/common/annotations/");
Expand All @@ -273,8 +291,9 @@ private static Map<String, FoundAndTotal> addGrowth(
Map<String, FoundAndTotal> data = new HashMap<>();
char c = 0;
Set<String> latestAvailable = newFactory.getAvailableLanguages();
boolean firstShowMissing = true;
for (String locale : newFactory.getAvailableLanguages()) {
if (!matcher.reset(locale).matches()) {
if (!localeMatcher.reset(locale).matches()) {
continue;
}
if (!latestAvailable.contains(locale)) {
Expand Down Expand Up @@ -334,6 +353,43 @@ private static Map<String, FoundAndTotal> addGrowth(
missingPaths,
unconfirmedPaths);

if (showMissing) {
if (CldrModernLocales.contains(locale)) {
if (firstShowMissing) {
firstShowMissing = false;
System.out.println(
"\nLocale"
+ "\tCore\tUnc\tMiss"
+ "\tBasic\tUnc\tMiss"
+ "\tModer\tUnc\tMiss"
+ "\tModern\tUnc\tMiss");
}
System.out.println(
locale
+ show(
Level.CORE,
foundCounter,
unconfirmedCounter,
missingCounter)
+ show(
Level.BASIC,
foundCounter,
unconfirmedCounter,
missingCounter)
+ show(
Level.MODERATE,
foundCounter,
unconfirmedCounter,
missingCounter)
+ show(
Level.MODERN,
foundCounter,
unconfirmedCounter,
missingCounter));
int line = 0;
}
}

// HACK
Set<Entry<MissingStatus, String>> missingRemovals = new HashSet<>();
for (Entry<MissingStatus, String> e : missingPaths.keyValueSet()) {
Expand All @@ -355,7 +411,7 @@ private static Map<String, FoundAndTotal> addGrowth(
}
// END HACK

if (showMissing) {
if (false && showMissing) {
int count = 0;
for (String s : unconfirmedPaths) {
System.out.println(
Expand All @@ -382,4 +438,18 @@ private static Map<String, FoundAndTotal> addGrowth(
}
return Collections.unmodifiableMap(data);
}

/**
 * Formats the counts for one coverage level as three tab-prefixed columns, in the order found,
 * unconfirmed, missing. The result lines up with a header group such as "\tCore\tUnc\tMiss".
 *
 * @param level the coverage level whose counts are looked up in each counter
 * @param foundCounter per-level counter read for the first column
 * @param unconfirmedCounter per-level counter read for the second column
 * @param missingCounter per-level counter read for the third column
 * @return the three values, each preceded by a tab character
 */
private static String show(
        Level level,
        Counter<Level> foundCounter,
        Counter<Level> unconfirmedCounter,
        Counter<Level> missingCounter) {
    final StringBuilder columns = new StringBuilder();
    columns.append('\t').append(foundCounter.get(level));
    columns.append('\t').append(unconfirmedCounter.get(level));
    columns.append('\t').append(missingCounter.get(level));
    return columns.toString();
}
}
167 changes: 167 additions & 0 deletions tools/cldr-code/src/main/java/org/unicode/cldr/tool/CompareEmoji.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
package org.unicode.cldr.tool;

import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Sets;
import com.ibm.icu.text.Collator;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import org.unicode.cldr.draft.FileUtilities;
import org.unicode.cldr.util.CLDRConfig;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRPaths;
import org.unicode.cldr.util.Emoji;
import org.unicode.cldr.util.Factory;
import org.unicode.cldr.util.SimpleFactory;
import org.unicode.cldr.util.XPathParts;

/**
 * Ad-hoc comparison tool. For every RGI emoji that has an entry in a locale's "removed" and/or
 * "added" CSV file, prints one tab-separated row to stdout containing the emoji, the status and
 * short name found in the CLDR annotation data, the search keywords left after the removals, and
 * the removed and added keyword lists.
 */
public class CompareEmoji {
    private static final Splitter BAR_SPLITTER = Splitter.on("|").trimResults().omitEmptyStrings();
    static final CLDRConfig CONFIG = CLDRConfig.getInstance();
    static final Factory FACTORY = CONFIG.getAnnotationsFactory();
    private static final File[] paths = {new File(CLDRPaths.ANNOTATIONS_DERIVED_DIRECTORY)};
    static final Factory FACTORY_DERIVED = SimpleFactory.make(paths, ".*");

    private static final Joiner BAR_JOINER = Joiner.on(" | ");
    private static final Collator collator = CLDRConfig.getInstance().getCollator();

    // NOTE(review): developer-specific absolute directory holding the <locale>_removed.csv and
    // <locale>_added.csv inputs; the tool only works where this path exists.
    private static final String base =
            "/Users/markdavis/github/private/DATA/cldr-private/emoji_diff/";

    /** All RGI emoji, in the order given by {@link #collator}. */
    private static final Set<String> sorted =
            ImmutableSet.copyOf(Emoji.getAllRgi().addAllTo(new TreeSet<>(collator)));

    /** Locale processed when no command-line argument is supplied. */
    private static final String DEFAULT_LOCALE = "zh_Hant";

    /** Where the annotation for an emoji was found (or that none was found). */
    enum Status {
        regular,
        constructed,
        missing;

        /** Returns the upper-cased first letter of the constant's name, e.g. 'R' for regular. */
        char abbreviation() {
            return Character.toUpperCase(name().charAt(0));
        }
    }

    /** Mutable holder for the annotation data collected for one emoji. */
    private static class EmojiData {
        String shortName;

        // Defaults to an empty set (not null): an emoji may have a name path but no keyword
        // path, and main() hands this set straight to Sets.difference / Joiner.join, both of
        // which throw NullPointerException on null.
        Set<String> searchKeywords = Set.of();

        Status status;

        @Override
        public String toString() {
            return shortName + "; " + searchKeywords + "; " + status;
        }
    }

    /**
     * Prints the comparison table for one locale.
     *
     * @param args optional; if present, {@code args[0]} is the locale ID to process, otherwise
     *     {@code zh_Hant} is used
     * @throws IOException if one of the CSV input files cannot be read
     */
    public static void main(String[] args) throws IOException {
        final String locale = args.length > 0 ? args[0] : DEFAULT_LOCALE;

        Map<String, EmojiData> annotations = getDataFor(locale);

        Map<String, Set<String>> removed = loadItems(locale, "_removed.csv", new HashMap<>());
        Map<String, Set<String>> added = loadItems(locale, "_added.csv", new HashMap<>());

        int count = 0;
        System.out.println("No.\tEmoji\tType\tName\tCommon\tRemoved\tAdded");
        for (String key : sorted) {
            Set<String> removedSet = removed.get(key);
            Set<String> addedSet = added.get(key);
            if (removedSet == null && addedSet == null) {
                continue; // nothing changed for this emoji, so no row
            }

            // Annotation data is looked up by the emoji with variation selectors stripped,
            // while the CSV maps are looked up by the unmodified RGI string.
            String minimal = key.replace(Emoji.EMOJI_VARIANT, "");
            EmojiData v = annotations.get(minimal);
            Set<String> commonSet;
            String shortName;
            Status status;
            if (v == null) {
                commonSet = Set.of();
                shortName = "<constructed>";
                status = Status.missing;
            } else {
                commonSet = v.searchKeywords;
                shortName = v.shortName;
                status = v.status;
            }

            if (removedSet != null) {
                commonSet = Sets.difference(commonSet, removedSet);
            }
            System.out.println(
                    ++count //
                            + "\t"
                            + key //
                            + "\t"
                            + status.abbreviation() //
                            + "\t"
                            + shortName //
                            + "\t"
                            + BAR_JOINER.join(commonSet) //
                            + "\t"
                            + (removedSet == null ? "" : BAR_JOINER.join(removedSet)) //
                            + "\t"
                            + (addedSet == null ? "" : BAR_JOINER.join(addedSet)) //
                    );
        }
    }

    /**
     * Collects annotation data for a locale, first from {@link #FACTORY} (recorded as {@link
     * Status#regular}) and then from {@link #FACTORY_DERIVED} (recorded as {@link
     * Status#constructed} for emoji not already seen).
     *
     * @param locale the locale ID passed to both factories
     * @return a new map from the {@code cp} attribute value to the collected data
     */
    private static Map<String, EmojiData> getDataFor(String locale) {
        Map<String, EmojiData> result = new HashMap<>();
        CLDRFile cldrfile = FACTORY.make(locale, true);
        getDataIn(cldrfile, result, Status.regular);
        CLDRFile cldrfileDerived = FACTORY_DERIVED.make(locale, true);
        getDataIn(cldrfileDerived, result, Status.constructed);
        return result;
    }

    /**
     * Adds the data from every path in {@code cldrfile} whose last element has a {@code cp}
     * attribute. A path whose last element also has a {@code type} attribute supplies the short
     * name; any other such path supplies the bar-separated search keywords.
     *
     * @param cldrfile the file whose paths are scanned
     * @param result the map to update, keyed by the {@code cp} attribute value
     * @param status the status given to records created by this call; existing records keep the
     *     status they already have
     */
    public static void getDataIn(CLDRFile cldrfile, Map<String, EmojiData> result, Status status) {
        for (String path : cldrfile) {
            XPathParts parts = XPathParts.getFrozenInstance(path);
            String cp = parts.getAttributeValue(-1, "cp");
            if (cp == null) {
                continue;
            }
            EmojiData record = result.get(cp);
            if (record == null) {
                record = new EmojiData();
                record.status = status;
                result.put(cp, record);
            }
            boolean istts = parts.getAttributeValue(-1, "type") != null;
            String value = cldrfile.getStringValue(path);
            if (istts) {
                record.shortName = value;
            } else {
                record.searchKeywords = ImmutableSet.copyOf(BAR_SPLITTER.splitToList(value));
            }
        }
    }

    /**
     * Loads one CSV file, named {@code locale + suffix} in the {@link #base} directory, into
     * {@code result}. Lines starting with {@code "Emoji,"} (the header) and lines that split into
     * fewer than two fields are skipped. For every other line, the first field is the key and the
     * remaining fields become its value set; a later line with the same key replaces the earlier
     * one.
     *
     * @param locale the locale ID forming the first part of the file name
     * @param suffix the rest of the file name, e.g. {@code "_removed.csv"}
     * @param result the map to add entries to
     * @return {@code result}
     * @throws IOException if the file cannot be opened or read
     */
    public static Map<String, Set<String>> loadItems(
            String locale, String suffix, Map<String, Set<String>> result) throws IOException {
        try (BufferedReader reader = FileUtilities.openUTF8Reader(base, locale + suffix)) {
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                if (line.startsWith("Emoji,")) {
                    continue;
                }
                String[] split = FileUtilities.splitCommaSeparated(line);
                if (split.length < 2) {
                    continue;
                }
                // Go through a collator-ordered set first so the immutable copy keeps that
                // iteration order for output.
                Set<String> values = new TreeSet<>(collator);
                for (int i = 1; i < split.length; ++i) {
                    values.add(split[i]);
                }
                result.put(split[0], ImmutableSet.copyOf(values));
            }
            return result;
        }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,9 @@
public class ShowLocaleCoverage {

private static final String TSV_BASE =
"https://github.com/unicode-org/cldr-staging/blob/main/docs/charts/43/tsv/";
"https://github.com/unicode-org/cldr-staging/blob/main/docs/charts/"
+ ToolConstants.CHART_VI.getVersionString(1, 2)
+ "/tsv/";
private static final Splitter LF_SPLITTER = Splitter.on('\n');

// thresholds for measuring Level attainment
Expand Down

0 comments on commit f20bee6

Please sign in to comment.