From 3bea38341ea4e6ec5f86559531c160855ad7a33a Mon Sep 17 00:00:00 2001 From: Mark Davis Date: Wed, 18 Oct 2023 19:03:36 +0200 Subject: [PATCH] CLDR-17153 Investigate discrepancy between growth and coverage charts (#3324) * CLDR-17153 Investigate discrepancy between growth and coverage charts * CLDR-17153 Cleanup --- .../unicode/cldr/tool/ChartLocaleGrowth.java | 147 +++++++++++++-- .../org/unicode/cldr/tool/CompareEmoji.java | 167 ++++++++++++++++++ .../unicode/cldr/tool/ShowLocaleCoverage.java | 33 +--- .../unicode/cldr/util/TempPrintWriter.java | 20 +++ 4 files changed, 325 insertions(+), 42 deletions(-) create mode 100644 tools/cldr-code/src/main/java/org/unicode/cldr/tool/CompareEmoji.java diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ChartLocaleGrowth.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ChartLocaleGrowth.java index e059383d136..c0f517bd4db 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ChartLocaleGrowth.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ChartLocaleGrowth.java @@ -7,7 +7,6 @@ import com.ibm.icu.util.VersionInfo; import java.io.File; import java.io.IOException; -import java.io.PrintWriter; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -21,7 +20,6 @@ import java.util.Set; import java.util.TreeMap; import java.util.regex.Matcher; -import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.tool.Option.Options; import org.unicode.cldr.util.CLDRConfig; import org.unicode.cldr.util.CLDRFile; @@ -32,13 +30,16 @@ import org.unicode.cldr.util.Counter2; import org.unicode.cldr.util.Level; import org.unicode.cldr.util.LocaleNames; +import org.unicode.cldr.util.Organization; import org.unicode.cldr.util.PathHeader; import org.unicode.cldr.util.PathHeader.Factory; import org.unicode.cldr.util.PatternCache; import org.unicode.cldr.util.RegexLookup; import org.unicode.cldr.util.RegexLookup.LookupType; import org.unicode.cldr.util.SimpleFactory; +import org.unicode.cldr.util.StandardCodes; import org.unicode.cldr.util.SupplementalDataInfo; +import org.unicode.cldr.util.TempPrintWriter; import org.unicode.cldr.util.VettingViewer; import org.unicode.cldr.util.VettingViewer.MissingStatus; @@ -51,6 +52,11 @@ public class ChartLocaleGrowth { private static CLDRConfig testInfo = ToolConfig.getToolInstance(); private static final SupplementalDataInfo SUPPLEMENTAL_DATA_INFO = testInfo.getSupplementalDataInfo(); + static final Set CldrModernLocales = + StandardCodes.make().getLocaleCoverageLocales(Organization.cldr, Set.of(Level.MODERN)); + static final Set SpecialLocales = + StandardCodes.make() + .getLocaleCoverageLocales(Organization.special, Set.of(Level.MODERN)); private static org.unicode.cldr.util.Factory factory = testInfo.getCommonAndSeedAndMainAndAnnotationsFactory(); @@ -79,7 +85,11 @@ public class ChartLocaleGrowth { static final Options myOptions = new Options(); private enum MyOptions { - filter(".+", ".*", "Filter the information based on id, using a regex argument."), + filter(".+", ".*", "Filter the information based on locale, using a regex argument."), + Versions( + ".+", + ".*", + "Filter the information based on cldr version, using a regex argument."), // draftStatus(".+", "unconfirmed", "Filter the information to a minimum draft status."), ; @@ -95,25 +105,46 @@ private enum MyOptions { public static void main(String[] args) throws IOException { myOptions.parse(MyOptions.filter, args, true); - Matcher matcher = PatternCache.get(MyOptions.filter.option.getValue()).matcher(""); - - try (PrintWriter out = - FileUtilities.openUTF8Writer( - CLDRPaths.CHART_DIRECTORY + "tsv/", "locale-growth.tsv")) { - doGrowth(matcher, out); + Matcher localeMatcher = PatternCache.get(MyOptions.filter.option.getValue()).matcher(""); + Matcher versionMatcher = PatternCache.get(MyOptions.Versions.option.getValue()).matcher(""); + + try (TempPrintWriter out = + new TempPrintWriter( + CLDRPaths.CHART_DIRECTORY + "tsv/", "locale-growth.tsv"); + TempPrintWriter log = + new TempPrintWriter( + CLDRPaths.CHART_DIRECTORY + "tsv/", "locale-growth-log.tsv"); + TempPrintWriter logPaths = + new TempPrintWriter( + CLDRPaths.CHART_DIRECTORY + "tsv/", "locale-growth-paths.tsv"); ) { + doGrowth(localeMatcher, versionMatcher, out, log, logPaths); return; } } - private static void doGrowth(Matcher matcher, PrintWriter out) { + private static void doGrowth( + Matcher localeMatcher, + Matcher versionMatcher, + TempPrintWriter out, + TempPrintWriter log, + TempPrintWriter logPaths) { TreeMap> growthData = new TreeMap<>(Ordering.natural().reverse()); // sort by version, descending Map latestData = null; + ReleaseInfo last = versionToYear.get(0); for (ReleaseInfo versionNormalizedVersionAndYear : versionToYear) { + if (versionMatcher != null + && !versionMatcher + .reset(versionNormalizedVersionAndYear.version.getVersionString(1, 2)) + .matches()) { + continue; + } VersionInfo version = versionNormalizedVersionAndYear.version; int year = versionNormalizedVersionAndYear.year; String dir = ToolConstants.getBaseDirectory(version.getVersionString(2, 3)); - Map currentData = addGrowth(factory, dir, matcher, false); + boolean showMissing = last == versionNormalizedVersionAndYear; + Map currentData = + addGrowth(factory, dir, localeMatcher, showMissing, log, logPaths); long found = 0; long total = 0; for (Entry entry : currentData.entrySet()) { @@ -169,7 +200,7 @@ private ReleaseInfo(VersionInfo versionInfo, int year) { static { Object[][] mapping = { - {VersionInfo.getInstance(43), 2023}, + {VersionInfo.getInstance(44), 2023}, {VersionInfo.getInstance(42), 2022}, {VersionInfo.getInstance(40), 2021}, {VersionInfo.getInstance(38), 2020}, @@ -258,8 +289,10 @@ public String toString() { private static Map addGrowth( org.unicode.cldr.util.Factory latestFactory, String dir, - Matcher matcher, - boolean showMissing) { + Matcher localeMatcher, + boolean showMissing, + TempPrintWriter log, + TempPrintWriter logPaths) { final File mainDir = new File(dir + "/common/main/"); final File annotationDir = new File(dir + "/common/annotations/"); File[] paths = @@ -273,8 +306,9 @@ private static Map addGrowth( Map data = new HashMap<>(); char c = 0; Set latestAvailable = newFactory.getAvailableLanguages(); + boolean firstShowMissing = true; for (String locale : newFactory.getAvailableLanguages()) { - if (!matcher.reset(locale).matches()) { + if (!localeMatcher.reset(locale).matches()) { continue; } if (!latestAvailable.contains(locale)) { @@ -334,6 +368,73 @@ private static Map addGrowth( missingPaths, unconfirmedPaths); + if (showMissing) { + if (CldrModernLocales.contains(locale)) { + final boolean isSpecial = SpecialLocales.contains(locale); + if (firstShowMissing) { + firstShowMissing = false; + log.printlnWithTabs( + 16, + "Locale\tTC" + + "\tCore\tUnc\tMiss" + + "\tBasic\tUnc\tMiss" + + "\tModer\tUnc\tMiss" + + "\tModern\tUnc\tMiss" + + "\tTotal\tUnc\tMiss"); + logPaths.printlnWithTabs(3, "Locale\tLevel\tStatus\tPath"); + } + log.printlnWithTabs( + 16, + locale + + "\t" + + (isSpecial ? "" : "TC") + + show( + Level.CORE, + foundCounter, + unconfirmedCounter, + missingCounter) + + show( + Level.BASIC, + foundCounter, + unconfirmedCounter, + missingCounter) + + show( + Level.MODERATE, + foundCounter, + unconfirmedCounter, + missingCounter) + + show( + Level.MODERN, + foundCounter, + unconfirmedCounter, + missingCounter) + + show( + null, // total + foundCounter, + unconfirmedCounter, + missingCounter)); + if (!isSpecial) { + long count = unconfirmedCounter.getTotal() + missingCounter.getTotal(); + for (Entry statusAndPath : missingPaths.entrySet()) { + logPaths.printlnWithTabs( + 3, + locale + + "\t" + + count + + "\t" + + statusAndPath.getKey() + + "\t" + + statusAndPath.getValue()); + } + for (String path : unconfirmedPaths) { + logPaths.printlnWithTabs( + 3, locale + "\t" + count + "\tunconfirmed\t" + path); + } + } + int line = 0; + } + } + // HACK Set> missingRemovals = new HashSet<>(); for (Entry e : missingPaths.keyValueSet()) { @@ -355,7 +456,7 @@ private static Map addGrowth( } // END HACK - if (showMissing) { + if (false && showMissing) { int count = 0; for (String s : unconfirmedPaths) { System.out.println( @@ -382,4 +483,18 @@ private static Map addGrowth( } return Collections.unmodifiableMap(data); } + + /** "\tCore\tUnc\tMiss" */ + private static String show( + Level level, + Counter foundCounter, + Counter unconfirmedCounter, + Counter missingCounter) { + return "\t" + + (level != null ? foundCounter.get(level) : foundCounter.getTotal()) + + "\t" + + (level != null ? unconfirmedCounter.get(level) : unconfirmedCounter.getTotal()) + + "\t" + + (level != null ? missingCounter.get(level) : missingCounter.getTotal()); + } } diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/CompareEmoji.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/CompareEmoji.java new file mode 100644 index 00000000000..ab807fd4067 --- /dev/null +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/CompareEmoji.java @@ -0,0 +1,167 @@ +package org.unicode.cldr.tool; + +import com.google.common.base.Joiner; +import com.google.common.base.Splitter; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Sets; +import com.ibm.icu.text.Collator; +import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; +import org.unicode.cldr.draft.FileUtilities; +import org.unicode.cldr.util.CLDRConfig; +import org.unicode.cldr.util.CLDRFile; +import org.unicode.cldr.util.CLDRPaths; +import org.unicode.cldr.util.Emoji; +import org.unicode.cldr.util.Factory; +import org.unicode.cldr.util.SimpleFactory; +import org.unicode.cldr.util.XPathParts; + +public class CompareEmoji { + private static final Splitter BAR_SPLITTER = Splitter.on("|").trimResults().omitEmptyStrings(); + static final CLDRConfig CONFIG = CLDRConfig.getInstance(); + static final Factory FACTORY = CONFIG.getAnnotationsFactory(); + private static final File[] paths = {new File(CLDRPaths.ANNOTATIONS_DERIVED_DIRECTORY)}; + static final Factory FACTORY_DERIVED = SimpleFactory.make(paths, ".*"); + + private static final Joiner BAR_JOINER = Joiner.on(" | "); + private static final Collator collator = CLDRConfig.getInstance().getCollator(); + private static final String base = + "/Users/markdavis/github/private/DATA/cldr-private/emoji_diff/"; + private static final Set sorted = + ImmutableSet.copyOf(Emoji.getAllRgi().addAllTo(new TreeSet<>(collator))); + + enum Status { + regular, + constructed, + missing; + + char abbreviation() { + return Character.toUpperCase(name().charAt(0)); + } + } + + private static class EmojiData { + String shortName; + Set searchKeywords; + Status status; + + @Override + public String toString() { + return shortName + "; " + searchKeywords + "; " + status; + } + } + + public static void main(String[] args) throws IOException { + final String locale = "zh_Hant"; + + Map annotations = getDataFor(locale); + + Map> removed = loadItems(locale, "_removed.csv", new HashMap<>()); + Map> added = loadItems(locale, "_added.csv", new HashMap<>()); + + int count = 0; + System.out.println("No.\tEmoji\tType\tName\tCommon\tRemoved\tAdded"); + for (String key : sorted) { + String minimal = key.replace(Emoji.EMOJI_VARIANT, ""); + EmojiData v = annotations.get(minimal); + Set commonSet; + String shortName; + Status status; + if (v == null) { + commonSet = Set.of(); + shortName = ""; + status = Status.missing; + } else { + commonSet = v.searchKeywords; + shortName = v.shortName; + status = v.status; + } + + Set removedSet = removed.get(key); + Set addedSet = added.get(key); + if (removedSet == null && addedSet == null) { + continue; + } + if (removedSet != null) { + commonSet = Sets.difference(commonSet, removedSet); + } + System.out.println( + ++count // + + "\t" + + key // + + "\t" + + status.abbreviation() // + + "\t" + + shortName // + + "\t" + + BAR_JOINER.join(commonSet) // + + "\t" + + (removedSet == null ? "" : BAR_JOINER.join(removedSet)) // + + "\t" + + (addedSet == null ? "" : BAR_JOINER.join(addedSet)) // + ); + } + } + + private static Map getDataFor(String locale) { + Map result = new HashMap<>(); + CLDRFile cldrfile = FACTORY.make(locale, true); + getDataIn(cldrfile, result, Status.regular); + CLDRFile cldrfileDerived = FACTORY_DERIVED.make(locale, true); + getDataIn(cldrfileDerived, result, Status.constructed); + return result; + } + + public static void getDataIn(CLDRFile cldrfile, Map result, Status status) { + for (String path : cldrfile) { + XPathParts parts = XPathParts.getFrozenInstance(path); + String cp = parts.getAttributeValue(-1, "cp"); + if (cp == null) { + continue; + } + EmojiData record = result.get(cp); + if (record == null) { + result.put(cp, record = new EmojiData()); + record.status = status; + } + boolean istts = parts.getAttributeValue(-1, "type") != null; + String value = cldrfile.getStringValue(path); + if (istts) { + record.shortName = value; + } else { + record.searchKeywords = ImmutableSet.copyOf(BAR_SPLITTER.splitToList(value)); + } + } + } + + public static Map> loadItems( + String locale, String suffix, Map> result) throws IOException { + try (BufferedReader reader = FileUtilities.openUTF8Reader(base, locale + suffix)) { + while (true) { + String line = reader.readLine(); + if (line == null) { + return result; + } + if (line.startsWith("Emoji,")) { + continue; + } + String[] split = FileUtilities.splitCommaSeparated(line); + if (split.length < 2) { + continue; + } + String key = split[0]; + Set values = new TreeSet<>(collator); + for (int i = 1; i < split.length; ++i) { + values.add(split[i]); + } + values = ImmutableSet.copyOf(values); + result.put(key, values); + } + } + } +} diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ShowLocaleCoverage.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ShowLocaleCoverage.java index f9cd926013c..01c700eadf5 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ShowLocaleCoverage.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ShowLocaleCoverage.java @@ -67,8 +67,10 @@ public class ShowLocaleCoverage { private static final String TSV_BASE = - "https://github.com/unicode-org/cldr-staging/blob/main/docs/charts/43/tsv/"; - private static final Splitter LF_SPLITTER = Splitter.on('\n'); + "https://github.com/unicode-org/cldr-staging/blob/main/docs/charts/" + + ToolConstants.CHART_VI.getVersionString(1, 2) + + "/tsv/"; + public static final Splitter LF_SPLITTER = Splitter.on('\n'); // thresholds for measuring Level attainment private static final double BASIC_THRESHOLD = 1; @@ -341,7 +343,7 @@ private static void showCoverage( tsv_missing_counts.println(TSV_MISSING_COUNTS_HEADER); final int propertiesCoverageTabCount = 2; - printlnWithTabs(propertiesCoverage, propertiesCoverageTabCount, PROPERTIES_HEADER); + propertiesCoverage.printlnWithTabs(propertiesCoverageTabCount, PROPERTIES_HEADER); Set checkModernLocales = STANDARD_CODES.getLocaleCoverageLocales( @@ -906,8 +908,7 @@ private static void showCoverage( // now write properties file line if (computed != Level.UNDETERMINED) { - printlnWithTabs( - propertiesCoverage, + propertiesCoverage.printlnWithTabs( propertiesCoverageTabCount, locale + " ;\t" @@ -947,7 +948,7 @@ private static void showCoverage( } } String lineToPrint = "\n#EOF"; - printlnWithTabs(propertiesCoverage, propertiesCoverageTabCount, lineToPrint); + propertiesCoverage.printlnWithTabs(propertiesCoverageTabCount, lineToPrint); pw.println("

Main Table

"); pw.println(tablePrinter.toTable()); @@ -1063,26 +1064,6 @@ private static void showCoverage( } } - /** Println with extra tabs to appear as table in github */ - public static void printlnWithTabs( - TempPrintWriter printWriter, int desiredCount, String textToPrint) { - StringBuilder result = new StringBuilder(); - for (String line : LF_SPLITTER.split(textToPrint)) { - long count = desiredCount - line.chars().filter(ch -> ch == '\t').count(); - if (count < 0) { - throw new IllegalArgumentException("Too many tabs in line."); - } - result.append(line); - if (count != 0) { - for (int i = 0; i < count; ++i) { - result.append('\t'); - } - } - result.append('\n'); - } - printWriter.print(result); - } - private static String linkTsv(String tsvFileName) { return "" + tsvFileName + ""; } diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/TempPrintWriter.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/TempPrintWriter.java index c46401951b2..65c90a06646 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/TempPrintWriter.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/TempPrintWriter.java @@ -12,6 +12,7 @@ import java.nio.file.StandardCopyOption; import java.util.Random; import org.unicode.cldr.draft.FileUtilities; +import org.unicode.cldr.tool.ShowLocaleCoverage; /** * Simple utility to create a temporary file, write into it, then close it. If the file differs from @@ -105,6 +106,25 @@ public void println() { tempPrintWriter.println(); } + /** Println with extra tabs on each line, as needed, to appear as table in github */ + public void printlnWithTabs(int desiredCount, String textToPrint) { + StringBuilder result = new StringBuilder(); + for (String line : ShowLocaleCoverage.LF_SPLITTER.split(textToPrint)) { + long count = desiredCount - line.chars().filter(ch -> ch == '\t').count(); + if (count < 0) { + throw new IllegalArgumentException("Too many tabs in line."); + } + result.append(line); + if (count != 0) { + for (int i = 0; i < count; ++i) { + result.append('\t'); + } + } + result.append('\n'); + } + print(result); + } + /** * If contents(newFile) ≠ contents(oldFile), rename newFile to old. Otherwise delete newfile. * Return true if replaced. *