From f20bee626b0fe84e8570c60dffc42dab94d1b301 Mon Sep 17 00:00:00 2001 From: macchiati Date: Thu, 5 Oct 2023 11:48:34 +0200 Subject: [PATCH] CLDR-17153 Investigate discrepancy between growth and coverage charts --- .../unicode/cldr/tool/ChartLocaleGrowth.java | 86 ++++++++- .../org/unicode/cldr/tool/CompareEmoji.java | 167 ++++++++++++++++++ .../unicode/cldr/tool/ShowLocaleCoverage.java | 4 +- 3 files changed, 248 insertions(+), 9 deletions(-) create mode 100644 tools/cldr-code/src/main/java/org/unicode/cldr/tool/CompareEmoji.java diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ChartLocaleGrowth.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ChartLocaleGrowth.java index e059383d136..4193b041099 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ChartLocaleGrowth.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ChartLocaleGrowth.java @@ -32,12 +32,14 @@ import org.unicode.cldr.util.Counter2; import org.unicode.cldr.util.Level; import org.unicode.cldr.util.LocaleNames; +import org.unicode.cldr.util.Organization; import org.unicode.cldr.util.PathHeader; import org.unicode.cldr.util.PathHeader.Factory; import org.unicode.cldr.util.PatternCache; import org.unicode.cldr.util.RegexLookup; import org.unicode.cldr.util.RegexLookup.LookupType; import org.unicode.cldr.util.SimpleFactory; +import org.unicode.cldr.util.StandardCodes; import org.unicode.cldr.util.SupplementalDataInfo; import org.unicode.cldr.util.VettingViewer; import org.unicode.cldr.util.VettingViewer.MissingStatus; @@ -51,6 +53,8 @@ public class ChartLocaleGrowth { private static CLDRConfig testInfo = ToolConfig.getToolInstance(); private static final SupplementalDataInfo SUPPLEMENTAL_DATA_INFO = testInfo.getSupplementalDataInfo(); + static final Set CldrModernLocales = + StandardCodes.make().getLocaleCoverageLocales(Organization.cldr, Set.of(Level.MODERN)); private static org.unicode.cldr.util.Factory factory = testInfo.getCommonAndSeedAndMainAndAnnotationsFactory(); @@ -79,7 +83,11 @@ public class ChartLocaleGrowth { static final Options myOptions = new Options(); private enum MyOptions { - filter(".+", ".*", "Filter the information based on id, using a regex argument."), + filter(".+", ".*", "Filter the information based on locale, using a regex argument."), + Versions( + ".+", + ".*", + "Filter the information based on cldr version, using a regex argument."), // draftStatus(".+", "unconfirmed", "Filter the information to a minimum draft status."), ; @@ -95,25 +103,35 @@ private enum MyOptions { public static void main(String[] args) throws IOException { myOptions.parse(MyOptions.filter, args, true); - Matcher matcher = PatternCache.get(MyOptions.filter.option.getValue()).matcher(""); + Matcher localeMatcher = PatternCache.get(MyOptions.filter.option.getValue()).matcher(""); + Matcher versionMatcher = PatternCache.get(MyOptions.Versions.option.getValue()).matcher(""); try (PrintWriter out = FileUtilities.openUTF8Writer( CLDRPaths.CHART_DIRECTORY + "tsv/", "locale-growth.tsv")) { - doGrowth(matcher, out); + doGrowth(localeMatcher, versionMatcher, out); return; } } - private static void doGrowth(Matcher matcher, PrintWriter out) { + private static void doGrowth(Matcher localeMatcher, Matcher versionMatcher, PrintWriter out) { TreeMap> growthData = new TreeMap<>(Ordering.natural().reverse()); // sort by version, descending Map latestData = null; + ReleaseInfo last = versionToYear.get(0); for (ReleaseInfo versionNormalizedVersionAndYear : versionToYear) { + if (versionMatcher != null + && !versionMatcher + .reset(versionNormalizedVersionAndYear.version.getVersionString(1, 2)) + .matches()) { + continue; + } VersionInfo version = versionNormalizedVersionAndYear.version; int year = versionNormalizedVersionAndYear.year; String dir = ToolConstants.getBaseDirectory(version.getVersionString(2, 3)); - Map currentData = addGrowth(factory, dir, matcher, false); + boolean showMissing = last == versionNormalizedVersionAndYear; + Map currentData = + addGrowth(factory, dir, localeMatcher, showMissing); long found = 0; long total = 0; for (Entry entry : currentData.entrySet()) { @@ -258,7 +276,7 @@ public String toString() { private static Map addGrowth( org.unicode.cldr.util.Factory latestFactory, String dir, - Matcher matcher, + Matcher localeMatcher, boolean showMissing) { final File mainDir = new File(dir + "/common/main/"); final File annotationDir = new File(dir + "/common/annotations/"); @@ -273,8 +291,9 @@ private static Map addGrowth( Map data = new HashMap<>(); char c = 0; Set latestAvailable = newFactory.getAvailableLanguages(); + boolean firstShowMissing = true; for (String locale : newFactory.getAvailableLanguages()) { - if (!matcher.reset(locale).matches()) { + if (!localeMatcher.reset(locale).matches()) { continue; } if (!latestAvailable.contains(locale)) { @@ -334,6 +353,43 @@ private static Map addGrowth( missingPaths, unconfirmedPaths); + if (showMissing) { + if (CldrModernLocales.contains(locale)) { + if (firstShowMissing) { + firstShowMissing = false; + System.out.println( + "\nLocale" + + "\tCore\tUnc\tMiss" + + "\tBasic\tUnc\tMiss" + + "\tModer\tUnc\tMiss" + + "\tModern\tUnc\tMiss"); + } + System.out.println( + locale + + show( + Level.CORE, + foundCounter, + unconfirmedCounter, + missingCounter) + + show( + Level.BASIC, + foundCounter, + unconfirmedCounter, + missingCounter) + + show( + Level.MODERATE, + foundCounter, + unconfirmedCounter, + missingCounter) + + show( + Level.MODERN, + foundCounter, + unconfirmedCounter, + missingCounter)); + int line = 0; + } + } + // HACK Set> missingRemovals = new HashSet<>(); for (Entry e : missingPaths.keyValueSet()) { @@ -355,7 +411,7 @@ private static Map addGrowth( } // END HACK - if (showMissing) { + if (false && showMissing) { int count = 0; for (String s : unconfirmedPaths) { System.out.println( @@ -382,4 +438,18 @@ private static Map addGrowth( } return Collections.unmodifiableMap(data); } + + /** "\tCore\tUnc\tMiss" */ + private static String show( + Level level, + Counter foundCounter, + Counter unconfirmedCounter, + Counter missingCounter) { + return "\t" + + foundCounter.get(level) + + "\t" + + unconfirmedCounter.get(level) + + "\t" + + missingCounter.get(level); + } } diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/CompareEmoji.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/CompareEmoji.java new file mode 100644 index 00000000000..ab807fd4067 --- /dev/null +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/CompareEmoji.java @@ -0,0 +1,167 @@ +package org.unicode.cldr.tool; + +import com.google.common.base.Joiner; +import com.google.common.base.Splitter; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Sets; +import com.ibm.icu.text.Collator; +import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; +import org.unicode.cldr.draft.FileUtilities; +import org.unicode.cldr.util.CLDRConfig; +import org.unicode.cldr.util.CLDRFile; +import org.unicode.cldr.util.CLDRPaths; +import org.unicode.cldr.util.Emoji; +import org.unicode.cldr.util.Factory; +import org.unicode.cldr.util.SimpleFactory; +import org.unicode.cldr.util.XPathParts; + +public class CompareEmoji { + private static final Splitter BAR_SPLITTER = Splitter.on("|").trimResults().omitEmptyStrings(); + static final CLDRConfig CONFIG = CLDRConfig.getInstance(); + static final Factory FACTORY = CONFIG.getAnnotationsFactory(); + private static final File[] paths = {new File(CLDRPaths.ANNOTATIONS_DERIVED_DIRECTORY)}; + static final Factory FACTORY_DERIVED = SimpleFactory.make(paths, ".*"); + + private static final Joiner BAR_JOINER = Joiner.on(" | "); + private static final Collator collator = CLDRConfig.getInstance().getCollator(); + private static final String base = + "/Users/markdavis/github/private/DATA/cldr-private/emoji_diff/"; + private static final Set sorted = + ImmutableSet.copyOf(Emoji.getAllRgi().addAllTo(new TreeSet<>(collator))); + + enum Status { + regular, + constructed, + missing; + + char abbreviation() { + return Character.toUpperCase(name().charAt(0)); + } + } + + private static class EmojiData { + String shortName; + Set searchKeywords; + Status status; + + @Override + public String toString() { + return shortName + "; " + searchKeywords + "; " + status; + } + } + + public static void main(String[] args) throws IOException { + final String locale = "zh_Hant"; + + Map annotations = getDataFor(locale); + + Map> removed = loadItems(locale, "_removed.csv", new HashMap<>()); + Map> added = loadItems(locale, "_added.csv", new HashMap<>()); + + int count = 0; + System.out.println("No.\tEmoji\tType\tName\tCommon\tRemoved\tAdded"); + for (String key : sorted) { + String minimal = key.replace(Emoji.EMOJI_VARIANT, ""); + EmojiData v = annotations.get(minimal); + Set commonSet; + String shortName; + Status status; + if (v == null) { + commonSet = Set.of(); + shortName = ""; + status = Status.missing; + } else { + commonSet = v.searchKeywords; + shortName = v.shortName; + status = v.status; + } + + Set removedSet = removed.get(key); + Set addedSet = added.get(key); + if (removedSet == null && addedSet == null) { + continue; + } + if (removedSet != null) { + commonSet = Sets.difference(commonSet, removedSet); + } + System.out.println( + ++count // + + "\t" + + key // + + "\t" + + status.abbreviation() // + + "\t" + + shortName // + + "\t" + + BAR_JOINER.join(commonSet) // + + "\t" + + (removedSet == null ? "" : BAR_JOINER.join(removedSet)) // + + "\t" + + (addedSet == null ? "" : BAR_JOINER.join(addedSet)) // + ); + } + } + + private static Map getDataFor(String locale) { + Map result = new HashMap<>(); + CLDRFile cldrfile = FACTORY.make(locale, true); + getDataIn(cldrfile, result, Status.regular); + CLDRFile cldrfileDerived = FACTORY_DERIVED.make(locale, true); + getDataIn(cldrfileDerived, result, Status.constructed); + return result; + } + + public static void getDataIn(CLDRFile cldrfile, Map result, Status status) { + for (String path : cldrfile) { + XPathParts parts = XPathParts.getFrozenInstance(path); + String cp = parts.getAttributeValue(-1, "cp"); + if (cp == null) { + continue; + } + EmojiData record = result.get(cp); + if (record == null) { + result.put(cp, record = new EmojiData()); + record.status = status; + } + boolean istts = parts.getAttributeValue(-1, "type") != null; + String value = cldrfile.getStringValue(path); + if (istts) { + record.shortName = value; + } else { + record.searchKeywords = ImmutableSet.copyOf(BAR_SPLITTER.splitToList(value)); + } + } + } + + public static Map> loadItems( + String locale, String suffix, Map> result) throws IOException { + try (BufferedReader reader = FileUtilities.openUTF8Reader(base, locale + suffix)) { + while (true) { + String line = reader.readLine(); + if (line == null) { + return result; + } + if (line.startsWith("Emoji,")) { + continue; + } + String[] split = FileUtilities.splitCommaSeparated(line); + if (split.length < 2) { + continue; + } + String key = split[0]; + Set values = new TreeSet<>(collator); + for (int i = 1; i < split.length; ++i) { + values.add(split[i]); + } + values = ImmutableSet.copyOf(values); + result.put(key, values); + } + } + } +} diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ShowLocaleCoverage.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ShowLocaleCoverage.java index f9cd926013c..16c5cd34d51 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ShowLocaleCoverage.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ShowLocaleCoverage.java @@ -67,7 +67,9 @@ public class ShowLocaleCoverage { private static final String TSV_BASE = - "https://github.com/unicode-org/cldr-staging/blob/main/docs/charts/43/tsv/"; + "https://github.com/unicode-org/cldr-staging/blob/main/docs/charts/" + + ToolConstants.CHART_VI.getVersionString(1, 2) + + "/tsv/"; private static final Splitter LF_SPLITTER = Splitter.on('\n'); // thresholds for measuring Level attainment