diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateLikelySubtags.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateLikelySubtags.java index 9b97f6909c6..23dc80316f4 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateLikelySubtags.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateLikelySubtags.java @@ -10,10 +10,12 @@ import com.ibm.icu.impl.Row; import com.ibm.icu.impl.Row.R2; import com.ibm.icu.impl.Row.R3; -import com.ibm.icu.impl.Row.R4; +import com.ibm.icu.impl.locale.XCldrStub.Splitter; +import com.ibm.icu.lang.UCharacter; import com.ibm.icu.text.NumberFormat; import java.io.File; -import java.util.Collection; +import java.io.IOException; +import java.io.PrintWriter; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; @@ -23,9 +25,14 @@ import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; +import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.draft.ScriptMetadata; import org.unicode.cldr.draft.ScriptMetadata.Info; import org.unicode.cldr.tool.GenerateMaximalLocales.LocaleOverride; +import org.unicode.cldr.tool.GenerateMaximalLocales.LocaleStringComparator; +import org.unicode.cldr.tool.LangTagsData.LSRSource; +import org.unicode.cldr.tool.Option.Options; +import org.unicode.cldr.tool.Option.Params; import org.unicode.cldr.util.CLDRConfig; import org.unicode.cldr.util.CLDRFile; import org.unicode.cldr.util.CLDRLocale; @@ -51,6 +58,7 @@ * GenerateLikelyAdditions. 
*/ public class GenerateLikelySubtags { + private static final Joiner JOIN_TAB = Joiner.on('\t').useForNull("∅"); private static final CLDRConfig CLDR_CONFIG = CLDRConfig.getInstance(); @@ -62,19 +70,11 @@ public class GenerateLikelySubtags { private static final String DEBUG_ADD_KEY = "und_Latn_ZA"; - private static final boolean SHOW_ADD = - CldrUtility.getProperty("GenerateLikelySubtags_Debug", false); - private static final boolean SUPPRESS_CHANGES = - CldrUtility.getProperty("GenerateMaximalLocalesSuppress", false); - private static final boolean SHOW_CONTAINERS = false; - - private static final boolean SHOW_ALL_LANGUAGE_CODES = false; - private static final boolean SHOW_DETAILED = false; - private static final boolean SHOW_INCLUDED_EXCLUDED = false; - private static final double MIN_UNOFFICIAL_LANGUAGE_SIZE = 10000000; private static final double MIN_UNOFFICIAL_LANGUAGE_PROPORTION = 0.20; private static final double MIN_UNOFFICIAL_CLDR_LANGUAGE_SIZE = 100000; + + /** When a language is not official, scale it down. 
*/ private static final double UNOFFICIAL_SCALE_DOWN = 0.2; private static final File list[] = { @@ -91,13 +91,138 @@ public class GenerateLikelySubtags { Relation.of(new HashMap>(), HashSet.class); private static NumberFormat percent = NumberFormat.getPercentInstance(); - private static NumberFormat number = NumberFormat.getIntegerInstance(); + private static NumberFormat integer = NumberFormat.getIntegerInstance(); + + private static boolean DROP_HARDCODED = false; + + private enum MyOptions { + minimize(new Params().setHelp("Show minimization actions")), + add(new Params().setHelp("Show additions")), + population(new Params().setHelp("Show population data used")), + order(new Params().setHelp("Show the priority order for langauge data")), + debug(new Params().setHelp("Show other debug info")), + watch( + new Params() + .setHelp( + "Only show info for locales with listed fields ('|' separated), eg -w419|Aghb|AU|bjt will show info for bjt_Latn or und_Laoo_AU") + .setMatch(".*")), + ; + + // BOILERPLATE TO COPY + final Option option; + + private MyOptions(Params params) { + option = new Option(this, params); + } + + private static Options myOptions = new Options(); + + static { + for (MyOptions option : MyOptions.values()) { + myOptions.add(option, option.option); + } + } + + private static Set parse(String[] args) { + return myOptions.parse(MyOptions.values()[0], args, true); + } + } + + private static boolean SHOW_ADD; + private static boolean SHOW_MIN; + private static boolean SHOW_POP; + private static boolean SHOW_ORDER; + private static boolean DEBUG; + private static Map WATCH_PAIRS = null; + + private static final boolean SHOW_OVERRIDES = true; + + static final Map silData = LangTagsData.getJsonData(); + + public static void main(String[] args) throws IOException { + MyOptions.parse(args); + SHOW_ADD = MyOptions.add.option.doesOccur(); + SHOW_MIN = MyOptions.minimize.option.doesOccur(); + SHOW_POP = MyOptions.population.option.doesOccur(); + SHOW_ORDER = 
MyOptions.order.option.doesOccur(); + DEBUG = MyOptions.debug.option.doesOccur(); + String watchValues = MyOptions.watch.option.getValue(); + if (watchValues != null) { + Map temp = new TreeMap<>(); + Splitter.on('|') + .split(watchValues) + .forEach(x -> temp.put(x, getTypeFromCasedSubtag(x))); + WATCH_PAIRS = ImmutableMap.copyOf(temp); + } + + Map old = supplementalData.getLikelySubtags(); + Map oldOrigins = supplementalData.getLikelyOrigins(); + System.out.println("origins: " + new TreeSet<>(oldOrigins.values())); + + Map toMaximized = generatePopulationData(new TreeMap<>(LOCALE_SOURCE)); + + Map itemsRemoved = new TreeMap<>(); + + Map result = minimize(toMaximized, itemsRemoved); + + Set newAdditions = new TreeSet(); + Set newMissing = new TreeSet(); + + // Check against last version + + System.out.println(JOIN_TAB.join("Source", "Name", "oldValue", "Name", "newValue", "Name")); + + Set sorted = new TreeSet<>(LOCALE_SOURCE); + sorted.addAll(result.keySet()); + sorted.addAll(old.keySet()); + + for (String source : sorted) { + String oldValue = old.get(source); + String newValue = result.get(source); + String removal = itemsRemoved.get(source); + + if (Objects.equal(oldValue, newValue)) { + continue; + } + + // SKIP the sil values; those will be recreated + + final String origins = oldOrigins.get(source); + if (origins != null && origins.contains("sil1")) { + continue; // skip for now + } + + // skip new values, or oldValues that are specifically removed + + if (oldValue == null || oldValue.equals(removal)) { + continue; // skip for now + } - private static boolean DROP = false; + // special cases + + if (getPart(source, LstrType.language).equals("und") + && oldValue.startsWith("en_Latn")) { + continue; // skip for now + } + + // show the remainder + + System.out.println( + JOIN_TAB.join( + source, + getNameSafe(source), + oldValue, + getNameSafe(oldValue), + newValue, + getNameSafe(newValue))); + } + System.out.println("new missing\t" + newMissing); + + 
printLikelySubtags(result); + } static { - for (CLDRLocale locale : - ToolConfig.getToolInstance().getCldrFactory().getAvailableCLDRLocales()) { + for (CLDRLocale locale : mainFactory.getAvailableCLDRLocales()) { String region = locale.getCountry(); if (region == null || region.isEmpty() || Containment.isLeaf(region)) { continue; @@ -109,17 +234,17 @@ public class GenerateLikelySubtags { } private static final List KEEP_TARGETS = - DROP ? List.of() : List.of("und_Arab_PK", "und_Latn_ET", "hi_Latn"); + DROP_HARDCODED ? List.of() : List.of("und_Arab_PK", "und_Latn_ET", "hi_Latn"); private static final ImmutableSet deprecatedISONotInLST = - DROP ? ImmutableSet.of() : ImmutableSet.of("scc", "scr"); + DROP_HARDCODED ? ImmutableSet.of() : ImmutableSet.of("scc", "scr"); /** * This is the simplest way to override, by supplying the max value. It gets a very low weight, * so doesn't override any stronger value. */ private static final List MAX_ADDITIONS = - DROP + DROP_HARDCODED ? List.of() : List.of( "bss_Latn_CM", @@ -171,7 +296,7 @@ public class GenerateLikelySubtags { // Many of the overrides below can be removed once the language/pop/country data is updated. private static final Map LANGUAGE_OVERRIDES = CldrUtility.asMap( - DROP + DROP_HARDCODED ? 
new String[][] { {LocaleNames.UND, "en_Latn_US"}, } @@ -337,7 +462,7 @@ public class GenerateLikelySubtags { // {"tet", "Latn"}, // Tetum (East Timor) // {"tk", "Latn"}, // Turkmen (Turkmenistan) // {"ty", "Latn"}, // Tahitian (French Polynesia) - {LocaleNames.UND, "Latn"}, // Ultimate fallback + // {LocaleNames.UND, "Latn"}, // Ultimate fallback }; private static Map localeToScriptCache = new TreeMap<>(); @@ -370,47 +495,65 @@ public class GenerateLikelySubtags { private static int errorCount; - public static void main(String[] args) { - Map old = supplementalData.getLikelySubtags(); - Map oldOrigins = supplementalData.getLikelyOrigins(); - System.out.println("origins: " + new TreeSet<>(oldOrigins.values())); - - Map toMaximized = generatePopulationData(new TreeMap<>(LOCALE_SOURCE)); - - Map result = minimize(toMaximized); - - Set newAdditions = new TreeSet(); - Set newMissing = new TreeSet(); + /** + * Debugging function that returns false if the flag is false, otherwise returns true if the + * WATCH is null or any of the locales matches the WATCH. + * + * @param flag + * @param locales + * @return + */ + static boolean watching(boolean flag, String... locales) { + if (!flag) { + return false; + } + if (WATCH_PAIRS == null) { + return true; + } + for (String locale : locales) { + for (Entry entry : WATCH_PAIRS.entrySet()) { + if (entry.getKey().equals(getPart(locale, entry.getValue()))) { + return true; + } + } + } + return false; + } - System.out.println(JOIN_TAB.join("Source", "Name", "oldValue", "Name", "newValue", "Name")); + /** + * Get the LstrType from well-formed, properly cased LSTR subtag. Otherwise, returns null from + * null, otherwise garbage. + */ + public static LstrType getTypeFromCasedSubtag(String casedSubtag) { + if (casedSubtag == null) { + return null; + } + final char cp0 = casedSubtag.charAt(0); + final char cp1 = casedSubtag.charAt(1); + return cp0 > 'Z' + ? LstrType.language // de + : cp1 > 'Z' + ? 
LstrType.script // Latn + : LstrType.region; // US, 001 + } - Set sorted = new TreeSet<>(LOCALE_SOURCE); - sorted.addAll(result.keySet()); - sorted.addAll(old.keySet()); + /** Get the part of a locale according to the LstrType */ + public static String getPart(String locale, LstrType lstrType) { + return getPart(CLDRLocale.getInstance(locale), lstrType); + } - for (String source : sorted) { - String oldValue = old.get(source); - String newValue = result.get(source); - if (Objects.equal(oldValue, newValue)) { - continue; - } - final String origins = oldOrigins.get(source); - if (origins != null && origins.contains("sil1")) { - continue; // skip for now - } - if (oldValue == null) { - continue; // skip for now - } - System.out.println( - JOIN_TAB.join( - source, - getNameSafe(source), - oldValue, - getNameSafe(oldValue), - newValue, - getNameSafe(newValue))); + /** Get the part of a locale according to the LstrType */ + public static String getPart(CLDRLocale loc, LstrType type) { + switch (type) { + case language: + return loc.getLanguage(); + case script: + return loc.getScript(); + case region: + return loc.getCountry(); + default: + throw new IllegalArgumentException(type.toString()); } - System.out.println("new missing\t" + newMissing); } /** @@ -427,10 +570,10 @@ public int compare(String locale1, String locale2) { // sort items with 0 components first, then 1, then 2 (there won't be 3) int result = ComparisonChain.start() - // .compare(getCount(l1), getCount(l2)) - .compare(fixUnd(l1.getLanguage()), fixUnd(l2.getLanguage())) - .compare(l1.getScript(), l2.getScript()) - .compare(l1.getCountry(), l2.getCountry()) + // .compare(getMask(l1), getMask(l2)) + .compare(getLanguage(l1), getLanguage(l2)) + .compare(getScript(l1), getScript(l2)) + .compare(getRegion(l1), getRegion(l2)) .result(); if (result == 0 && !locale1.equals(locale2)) { throw new IllegalArgumentException(); @@ -438,16 +581,20 @@ public int compare(String locale1, String locale2) { return result; } 
- private int getCount(CLDRLocale l1) { - int result = - ("und".equals(l1.getLanguage()) ? 0 : 1) - + (l1.getScript().isEmpty() ? 0 : 1) - + (l1.getCountry().isEmpty() ? 0 : 1); - return result; + private String getLanguage(CLDRLocale loc) { + return replaceMissing(loc.getLanguage(), "und", "Ω"); + } + + private String getScript(CLDRLocale loc) { + return loc.getScript(); } - private String fixUnd(String language) { - return "und".equals(language) ? "" : language; + private String getRegion(CLDRLocale loc) { + return loc.getCountry(); + } + + private String replaceMissing(String field, String ifEqual, String replacement) { + return ifEqual.equals(field) ? replacement : field; } }; @@ -457,12 +604,32 @@ private String fixUnd(String language) { public static String getNameSafe(String oldValue) { try { - return english.getName(oldValue); + if (oldValue != null) { + String result = english.getName(oldValue); + if (result.startsWith("Unknown language ")) { + result = result.substring("Unknown language ".length()); + } + return result; + } } catch (Exception e) { - return "n/a"; } + return "n/a"; + } + + enum OutputStyle { + PLAINTEXT, + C, + C_ALT, + XML } + private static OutputStyle OUTPUT_STYLE = + OutputStyle.valueOf(CldrUtility.getProperty("OutputStyle", "XML", "XML").toUpperCase()); + + private static final String TAG_SEPARATOR = OUTPUT_STYLE == OutputStyle.C_ALT ? "-" : "_"; + + private static final Joiner JOIN_SPACE = Joiner.on(' '); + private static Map generatePopulationData(Map toMaximized) { // we are going to try a different approach. 
// first gather counts for maximized values @@ -515,20 +682,17 @@ private static Map generatePopulationData(Map to // continue; } order *= UNOFFICIAL_SCALE_DOWN; - if (SHOW_ADD) + if (watching(SHOW_POP, writtenLanguage)) System.out.println( - "Retaining\t" - + writtenLanguage - + "\t" - + region - + "\t" - + getNameSafe(locale) - + "\t" - + number.format(literatePopulation) - + "\t" - + percent.format( - literatePopulation / literateTerritoryPopulation) - + (cldrLocales.contains(locale) ? "\tin-CLDR" : "")); + JOIN_TAB.join( + "Pop:", + writtenLanguage, + region, + getNameSafe(locale), + integer.format(literatePopulation), + percent.format( + literatePopulation / literateTerritoryPopulation), + cldrLocales.contains(locale) ? "CLDR Loc" : "")); } String script = localeToScriptCache.get(writtenLanguage); @@ -550,7 +714,22 @@ private static Map generatePopulationData(Map to } } if (!noPopulationData.isEmpty()) { + System.out.println("script data to add"); + Set stillBad = new TreeSet<>(); for (String lang : noPopulationData) { + LSRSource silLSR = silData.get(lang); + if (silLSR == null) { + stillBad.add(lang); + } else { + System.out.println( + " "); + } + } + for (String lang : stillBad) { System.out.println( JOIN_TAB.join("No script in pop. data for", lang, getNameSafe(lang))); } @@ -569,18 +748,21 @@ private static Map generatePopulationData(Map to } } - for (Entry> entry : - DeriveScripts.getLanguageToScript().asMap().entrySet()) { - String language = entry.getKey(); - final Collection values = entry.getValue(); - if (values.size() != 1) { - continue; // skip, no either way - } - Set> old = maxData.languages.get(language); - if (!maxData.languages.containsKey(language)) { - maxData.add(language, values.iterator().next(), TEMP_UNKNOWN_REGION, 1.0); - } - } + // Old code for getting language to script, adding XZ, which converts to ZZ. 
Replaced by use + // of SIL data + + // for (Entry> entry : + // DeriveScripts.getLanguageToScript().asMap().entrySet()) { + // String language = entry.getKey(); + // final Collection values = entry.getValue(); + // if (values.size() != 1) { + // continue; // skip, no either way + // } + // Set> old = maxData.languages.get(language); + // if (!maxData.languages.containsKey(language)) { + // maxData.add(language, values.iterator().next(), TEMP_UNKNOWN_REGION, 1.0); + // } + // } // add others, with English default for (String region : otherTerritories) { @@ -642,8 +824,7 @@ private static Map generatePopulationData(Map to language + "_" + script + "_" + region, toMaximized, "L->SR", - LocaleOverride.REPLACE_EXISTING, - SHOW_ADD); + LocaleOverride.REPLACE_EXISTING); } for (String language : maxData.languagesToScripts.keySet()) { String script = @@ -657,8 +838,7 @@ private static Map generatePopulationData(Map to language + "_" + script, toMaximized, "L->S", - LocaleOverride.REPLACE_EXISTING, - SHOW_ADD); + LocaleOverride.REPLACE_EXISTING); } for (String language : maxData.languagesToRegions.keySet()) { String region = @@ -672,21 +852,19 @@ private static Map generatePopulationData(Map to language + "_" + region, toMaximized, "L->R", - LocaleOverride.REPLACE_EXISTING, - SHOW_ADD); + LocaleOverride.REPLACE_EXISTING); } for (String script : maxData.scripts.keySet()) { R3 value = maxData.scripts.getAll(script).iterator().next(); - final Comparable language = value.get1(); - final Comparable region = value.get2(); + final String language = value.get1(); + final String region = value.get2(); add( "und_" + script, language + "_" + script + "_" + region, toMaximized, "S->LR", - LocaleOverride.REPLACE_EXISTING, - SHOW_ADD); + LocaleOverride.REPLACE_EXISTING); } for (String script : maxData.scriptsToLanguages.keySet()) { String language = @@ -700,8 +878,7 @@ private static Map generatePopulationData(Map to language + "_" + script, toMaximized, "S->L", - 
LocaleOverride.REPLACE_EXISTING, - SHOW_ADD); + LocaleOverride.REPLACE_EXISTING); } for (String script : maxData.scriptsToRegions.keySet()) { String region = @@ -715,21 +892,19 @@ private static Map generatePopulationData(Map to "und_" + script + "_" + region, toMaximized, "S->R", - LocaleOverride.REPLACE_EXISTING, - SHOW_ADD); + LocaleOverride.REPLACE_EXISTING); } for (String region : maxData.regions.keySet()) { R3 value = maxData.regions.getAll(region).iterator().next(); - final Comparable language = value.get1(); - final Comparable script = value.get2(); + final String language = value.get1(); + final String script = value.get2(); add( "und_" + region, language + "_" + script + "_" + region, toMaximized, "R->LS", - LocaleOverride.REPLACE_EXISTING, - SHOW_ADD); + LocaleOverride.REPLACE_EXISTING); } for (String region : maxData.regionsToLanguages.keySet()) { String language = @@ -743,8 +918,7 @@ private static Map generatePopulationData(Map to language + "_" + region, toMaximized, "R->L", - LocaleOverride.REPLACE_EXISTING, - SHOW_ADD); + LocaleOverride.REPLACE_EXISTING); } for (String region : maxData.regionsToScripts.keySet()) { String script = @@ -758,110 +932,48 @@ private static Map generatePopulationData(Map to "und_" + script + "_" + region, toMaximized, "R->S", - LocaleOverride.REPLACE_EXISTING, - SHOW_ADD); - } - - for (Entry>> containerAndInfo : - maxData.containersToLanguage.entrySet()) { - String region = containerAndInfo.getKey(); - if (region.equals("001")) { - continue; - } - Counter> data = containerAndInfo.getValue(); - Set> keysetSortedByCount = data.getKeysetSortedByCount(true); - if (SHOW_CONTAINERS) { // debug - System.out.println( - "Container2L:\t" - + region - + "\t" - + truncateLongString( - data.getEntrySetSortedByCount(true, null), 127)); - System.out.println( - "Container2LR:\t" - + region - + "\t" - + maxData.containersToLangRegion.get(region)); - } - R2 value = - keysetSortedByCount.iterator().next(); // will get most negative - final 
Comparable language = value.get0(); - final Comparable script = value.get1(); - - // fix special cases like es-419, where a locale exists. - // for those cases, what we add as output is the container. Otherwise the region. - Set skipLanguages = cldrContainerToLanguages.get(region); - if (skipLanguages != null && skipLanguages.contains(language)) { - add( - "und_" + region, - language + "_" + script + "_" + region, - toMaximized, - "R*->LS", - LocaleOverride.REPLACE_EXISTING, - SHOW_ADD); - continue; - } - - // we now have the best language and script. Find the best region for that - for (R4 e : - maxData.containersToLangRegion.get(region)) { - final Comparable language2 = e.get1(); - final Comparable script2 = e.get2(); - if (language2.equals(language) && script2.equals(script)) { - add( - "und_" + region, - language + "_" + script + "_" + e.get3(), - toMaximized, - "R*->LS", - LocaleOverride.REPLACE_EXISTING, - SHOW_ADD); - break; - } - } + LocaleOverride.REPLACE_EXISTING); } for (R2 languageScript : maxData.languageScripts.keySet()) { R2 value = maxData.languageScripts.getAll(languageScript).iterator().next(); - final Comparable language = languageScript.get0(); - final Comparable script = languageScript.get1(); - final Comparable region = value.get1(); + final String language = languageScript.get0(); + final String script = languageScript.get1(); + final String region = value.get1(); add( language + "_" + script, language + "_" + script + "_" + region, toMaximized, "LS->R", - LocaleOverride.REPLACE_EXISTING, - SHOW_ADD); + LocaleOverride.REPLACE_EXISTING); } for (R2 scriptRegion : maxData.scriptRegions.keySet()) { R2 value = maxData.scriptRegions.getAll(scriptRegion).iterator().next(); - final Comparable script = scriptRegion.get0(); - final Comparable region = scriptRegion.get1(); - final Comparable language = value.get1(); + final String script = scriptRegion.get0(); + final String region = scriptRegion.get1(); + final String language = value.get1(); add( 
"und_" + script + "_" + region, language + "_" + script + "_" + region, toMaximized, "SR->L", - LocaleOverride.REPLACE_EXISTING, - SHOW_ADD); + LocaleOverride.REPLACE_EXISTING); } for (R2 languageRegion : maxData.languageRegions.keySet()) { R2 value = maxData.languageRegions.getAll(languageRegion).iterator().next(); - final Comparable language = languageRegion.get0(); - final Comparable region = languageRegion.get1(); - final Comparable script = value.get1(); + final String language = languageRegion.get0(); + final String region = languageRegion.get1(); + final String script = value.get1(); add( language + "_" + region, language + "_" + script + "_" + region, toMaximized, "LR->S", - LocaleOverride.REPLACE_EXISTING, - SHOW_ADD); + LocaleOverride.REPLACE_EXISTING); } // get the script info from metadata as fallback @@ -875,31 +987,15 @@ private static Map generatePopulationData(Map to } String originCountry = i.originCountry; final String result = likelyLanguage + "_" + script + "_" + originCountry; - add( - "und_" + script, - result, - toMaximized, - "S->LR•", - LocaleOverride.KEEP_EXISTING, - SHOW_ADD); - add( - likelyLanguage, - result, - toMaximized, - "L->SR•", - LocaleOverride.KEEP_EXISTING, - SHOW_ADD); + add("und_" + script, result, toMaximized, "S->LR•", LocaleOverride.KEEP_EXISTING); + add(likelyLanguage, result, toMaximized, "L->SR•", LocaleOverride.KEEP_EXISTING); } // add overrides - for (String key : LANGUAGE_OVERRIDES.keySet()) { - add( - key, - LANGUAGE_OVERRIDES.get(key), - toMaximized, - "OVERRIDE", - LocaleOverride.REPLACE_EXISTING, - true); + for (Entry entry : LANGUAGE_OVERRIDES.entrySet()) { + String source = entry.getKey(); + String target = entry.getValue(); + add(source, target, toMaximized, "OVERRIDE", LocaleOverride.REPLACE_EXISTING); } // Make sure that the mapping is Idempotent. 
If we have A ==> B, we must never have B ==> C @@ -939,6 +1035,7 @@ private static Map generatePopulationData(Map to return toMaximized; } + /** Class for maximizing data sources */ public static class MaxData { Relation> languages = Relation.of( @@ -958,12 +1055,6 @@ public static class MaxData { Map> regionsToLanguages = new TreeMap<>(); Map> regionsToScripts = new TreeMap<>(); - Map>> containersToLanguage = new TreeMap<>(); - Relation> containersToLangRegion = - Relation.of( - new TreeMap>>(), - TreeSet.class); - Relation, Row.R2> languageScripts = Relation.of( new TreeMap, Set>>(), @@ -987,9 +1078,11 @@ public static class MaxData { * @param order */ void add(String language, String script, String region, Double order) { - if (SHOW_ADD && language.equals(LocaleNames.MIS)) { - System.out.println(language + "\t" + script + "\t" + region + "\t" + -order); - } + if (watching(SHOW_ORDER, language)) + System.out.println( + JOIN_TAB.join( + "Add Data:", language, script, region, integer.format(order))); + languages.put(language, Row.of(order, script, region)); // addCounter(languagesToScripts, language, script, order); // addCounter(languagesToRegions, language, region, order); @@ -1005,23 +1098,6 @@ void add(String language, String script, String region, Double order) { languageScripts.put(Row.of(language, script), Row.of(order, region)); scriptRegions.put(Row.of(script, region), Row.of(order, language)); languageRegions.put(Row.of(language, region), Row.of(order, script)); - - Set containerSet = Containment.leafToContainer(region); - if (containerSet != null) { - for (String container : containerSet) { - - containersToLangRegion.put(container, Row.of(order, language, script, region)); - Counter> data = containersToLanguage.get(container); - if (data == null) { - containersToLanguage.put(container, data = new Counter<>()); - } - data.add(Row.of(language, script), (long) (double) order); - } - } - - if (SHOW_ADD) - System.out.println( - "Data:\t" + language + "\t" + 
script + "\t" + region + "\t" + order); } // private void addCounter(Map> map, String key, String key2, Double // count) { @@ -1041,8 +1117,13 @@ private static long getWritingPopulation(PopulationData popData) { return (long) popData.getLiteratePopulation(); } - private static String getName(String value) { - return ConvertLanguageData.getLanguageCodeAndName(value); + private static void add( + String key, + String value, + Map toAdd, + String kind, + LocaleOverride override) { + add(key, value, toAdd, kind, override, SHOW_ADD); } private static void add( @@ -1052,39 +1133,42 @@ private static void add( String kind, LocaleOverride override, boolean showAction) { - if (SHOW_ADD && key.startsWith(LocaleNames.MIS)) { - int debug = 1; - } - if (key.equals(DEBUG_ADD_KEY)) { - System.out.println("*debug*"); - } String oldValue = toAdd.get(key); if (oldValue == null) { - if (showAction) { + if (watching(showAction, key, value)) { System.out.println( - "\tAdding:\t\t" - + getName(key) - + "\t=>\t" - + getName(value) - + "\t\t\t\t" - + kind); + JOIN_TAB.join( + "", + "Adding:", + key, + getNameSafe(key), + "→", + value, + getNameSafe(value), + "", + "", + kind)); } } else if (override == LocaleOverride.KEEP_EXISTING || value.equals(oldValue)) { // if (showAction) { - // System.out.println("Skipping:\t" + key + "\t=>\t" + value + "\t\t\t\t" + kind); + // System.out.println("Skipping:\t" + key + "\t→\t" + value + "\t\t\t\t" + kind); // } return; } else { - if (showAction) { + if (watching(showAction, key, value)) { System.out.println( - "\tReplacing:\t" - + getName(key) - + "\t=>\t" - + getName(value) - + "\t, was\t" - + getName(oldValue) - + "\t\t" - + kind); + JOIN_TAB.join( + "", + "Replacing:", + key, + getNameSafe(key), + "→", + value, + getNameSafe(value), + ", was", + oldValue, + getNameSafe(oldValue), + kind)); } } toAdd.put(key, value); @@ -1093,54 +1177,41 @@ private static void add( public static String truncateLongString(Object data, int maxLen) { String info = 
data.toString(); if (info.length() > maxLen) { + if (UCharacter.codePointAt(info, maxLen - 1) > 0xFFFF) { + maxLen--; + } info = info.substring(0, maxLen) + "…"; - // TODO, handle supplemental characters. } return info; } - public static Map minimize(Map fluffup) { + public static Map minimize( + Map fluffup, Map itemsRemoved) { LanguageTagParser parser = new LanguageTagParser(); LanguageTagParser targetParser = new LanguageTagParser(); - Set removals = new TreeSet<>(); - while (true) { + Map removals = new TreeMap<>(); + for (int pass = 0; ; ++pass) { removals.clear(); - for (String locale : fluffup.keySet()) { - String target = fluffup.get(locale); + for (Entry entry : fluffup.entrySet()) { + String locale = entry.getKey(); + String target = entry.getValue(); + if (targetParser.set(target).getRegion().equals(LocaleScriptInfo.UNKNOWN_REGION)) { - removals.add(locale); - if (SHOW_ADD) - System.out.println( - "Removing:\t" - + getName(locale) - + "\t=>\t" - + getName(target) - + "\t\t - Unknown Region in target"); + removals.put(locale, target); + showRemoving(pass, locale, target, "Unknown Region in target"); continue; } if (targetParser.getScript().equals(LocaleScriptInfo.UNKNOWN_SCRIPT)) { - removals.add(locale); - if (SHOW_ADD) - System.out.println( - "Removing:\t" - + getName(locale) - + "\t=>\t" - + getName(target) - + "\t\t - Unknown Script in target"); + removals.put(locale, target); + showRemoving(pass, locale, target, "Unknown Script in target"); continue; } String region = parser.set(locale).getRegion(); if (region.length() != 0) { if (region.equals(LocaleScriptInfo.UNKNOWN_REGION)) { - removals.add(locale); - if (SHOW_ADD) - System.out.println( - "Removing:\t" - + getName(locale) - + "\t=>\t" - + getName(target) - + "\t\t - Unknown Region in source"); + removals.put(locale, target); + showRemoving(pass, locale, target, "Unknown Region in source"); continue; } parser.setRegion(""); @@ -1149,15 +1220,8 @@ public static Map minimize(Map fluffup) { if 
(newTarget != null) { newTarget = targetParser.set(newTarget).setRegion(region).toString(); if (target.equals(newTarget) && !KEEP_TARGETS.contains(locale)) { - removals.add(locale); - if (SHOW_ADD) - System.out.println( - "Removing:\t" - + locale - + "\t=>\t" - + target - + "\t\tRedundant with " - + newLocale); + removals.put(locale, target); + showRemoving(pass, locale, target, "Redundant with\t" + newLocale); continue; } } @@ -1168,14 +1232,8 @@ public static Map minimize(Map fluffup) { } if (script.length() != 0) { if (script.equals(LocaleScriptInfo.UNKNOWN_SCRIPT)) { - removals.add(locale); - if (SHOW_ADD) - System.out.println( - "Removing:\t" - + locale - + "\t=>\t" - + target - + "\t\t - Unknown Script"); + removals.put(locale, target); + showRemoving(pass, locale, target, "Unknown Script"); continue; } parser.setScript(""); @@ -1184,15 +1242,8 @@ public static Map minimize(Map fluffup) { if (newTarget != null) { newTarget = targetParser.set(newTarget).setScript(script).toString(); if (target.equals(newTarget) && !KEEP_TARGETS.contains(locale)) { - removals.add(locale); - if (SHOW_ADD) - System.out.println( - "Removing:\t" - + locale - + "\t=>\t" - + target - + "\t\tRedundant with " - + newLocale); + removals.put(locale, target); + showRemoving(pass, locale, target, "Redundant with\t" + newLocale); continue; } } @@ -1201,10 +1252,152 @@ public static Map minimize(Map fluffup) { if (removals.size() == 0) { break; } - for (String locale : removals) { + itemsRemoved.putAll(removals); + for (String locale : removals.keySet()) { fluffup.remove(locale); } } return fluffup; } + + /** Prints one tab-separated debug line (pass, Removing:, locale, arrow, target, an empty column, reason) to System.out, but only when watching(SHOW_MIN, target) returns true. */ public static void showRemoving( + Object pass, String locale, String target, final String reason) { + if (watching(SHOW_MIN, target)) { + System.out.println(JOIN_TAB.join(pass, "Removing:", locale, "→", target, "", reason)); + } + } + + /** Builds a human-readable name for a locale ID by joining the English language, script and region names with the given spacing joiner. A language equal to LocaleNames.UND, or a null or empty script or region, is rendered as a question mark. Returns null when locale is null. */ public static String printingName(String locale, Joiner spacing) { + if (locale == null) { + return null; + } + CLDRLocale cLocale = 
CLDRLocale.getInstance(locale); + String lang = cLocale.getLanguage(); + String script = cLocale.getScript(); + String region = cLocale.getCountry(); + return spacing.join( + (lang.equals(LocaleNames.UND) + ? "?" + : english.getName(CLDRFile.LANGUAGE_NAME, lang)), + (script == null || script.equals("") + ? "?" + : english.getName(CLDRFile.SCRIPT_NAME, script)), + (region == null || region.equals("") + ? "?" + : english.getName(CLDRFile.TERRITORY_NAME, region))); + } + + /** Writes the locale-to-target map to likelySubtags.xml or likelySubtags.txt (the extension is chosen by OUTPUT_STYLE) in the supplemental folder under CLDRPaths.GEN_DIRECTORY, iterating the keys in LocaleStringComparator order, and returns the written file. NOTE(review): several string literals in the header, footer and XML branch are empty in this copy and look like they lost their markup during extraction; confirm against the repository before relying on them. */ private static File printLikelySubtags(Map fluffup) throws IOException { + final File genDir = new File(CLDRPaths.GEN_DIRECTORY, "supplemental"); + final File genFile = + new File( + genDir, + "likelySubtags" + (OUTPUT_STYLE == OutputStyle.XML ? ".xml" : ".txt")); + System.out.println("Writing to " + genFile); + + // set based on above + final String SEPARATOR = + OUTPUT_STYLE == OutputStyle.C || OUTPUT_STYLE == OutputStyle.C_ALT + ? CldrUtility.LINE_SEPARATOR + : "\t"; + Joiner spacing = + Joiner.on(OUTPUT_STYLE == OutputStyle.PLAINTEXT ? "\t" : " ‧ ").useForNull("∅"); + + final String arrow = OUTPUT_STYLE == OutputStyle.PLAINTEXT ? "\t⇒\t" : "\t➡ "; + + try (PrintWriter out = FileUtilities.openUTF8Writer(genFile)) { + String header = + OUTPUT_STYLE != OutputStyle.XML + ? "const MapToMaximalSubtags default_subtags[] = {" + : "" + + CldrUtility.LINE_SEPARATOR + + "" + + CldrUtility.LINE_SEPARATOR + + "" + + CldrUtility.LINE_SEPARATOR + + "" + + CldrUtility.LINE_SEPARATOR + + "" + + CldrUtility.LINE_SEPARATOR + + " " + + CldrUtility.LINE_SEPARATOR + + " "; + String footer = + OUTPUT_STYLE != OutputStyle.XML + ? 
SEPARATOR + "};" : " " + CldrUtility.LINE_SEPARATOR + ""; + out.println(header); + boolean first = true; + Set keys = new TreeSet<>(new LocaleStringComparator()); + keys.addAll(fluffup.keySet()); + for (String printingLocale : keys) { + String printingTarget = fluffup.get(printingLocale); + String comment = + printingName(printingLocale, spacing) + + arrow + + printingName(printingTarget, spacing); + + if (OUTPUT_STYLE == OutputStyle.XML) { + out.println( + "\t\t" + + "\t\t" + + ""); + } else { + if (first) { + first = false; + } else { + out.print(","); + } + if (comment.length() > 70 && SEPARATOR.equals(CldrUtility.LINE_SEPARATOR)) { + comment = + printingName(printingLocale, spacing) + + SEPARATOR + + " // " + + arrow + + printingName(printingTarget, spacing); + } + out.print( + " {" + + SEPARATOR + + " // " + + comment + + SEPARATOR + + " \"" + + printingLocale + + "\"," + + SEPARATOR + + " \"" + + printingTarget + + "\"" + + CldrUtility.LINE_SEPARATOR + + " }"); + } + } + out.println(footer); + /* redundant: the enclosing try-with-resources statement already closes out */ out.close(); + } + return genFile; + } } diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/LangTagsData.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/LangTagsData.java new file mode 100644 index 00000000000..7ddfbbd863f --- /dev/null +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/LangTagsData.java @@ -0,0 +1,355 @@ +package org.unicode.cldr.tool; + +import com.google.common.base.Splitter; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Multimap; +import com.google.common.collect.TreeMultimap; +import com.ibm.icu.impl.Row; +import com.ibm.icu.util.Output; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; +import java.util.TreeMap; +import java.util.regex.Matcher; 
+import java.util.regex.Pattern; +import org.unicode.cldr.util.CLDRConfig; +import org.unicode.cldr.util.CLDRFile; +import org.unicode.cldr.util.CLDRPaths; +import org.unicode.cldr.util.Iso639Data; +import org.unicode.cldr.util.Iso639Data.Type; +import org.unicode.cldr.util.LanguageTagParser; +import org.unicode.cldr.util.StandardCodes.LstrType; +import org.unicode.cldr.util.Validity; +import org.unicode.cldr.util.Validity.Status; + +public class LangTagsData { + private final Pattern fullTagMatch = Pattern.compile("\\s*\"(full|tag)\": \"([^\"]+)\","); + private final String SIL = "sil1"; + + private final Splitter TAB_SPLITTER = Splitter.on('\t'); + private final Set LIKELY_SPECIALS = ImmutableSet.of("in", "iw", "ji", "jw", "mo"); + private final Set FIX_VALIDITY = ImmutableSet.of("Zanb"); + private final Set FIX_COUNTRY = ImmutableSet.of("yi"); + private final Validity validity = Validity.getInstance(); + + private static final CLDRConfig CLDR_CONFIG = CLDRConfig.getInstance(); + private static final CLDRFile english = CLDR_CONFIG.getEnglish(); + + private static final LangTagsData INSTANCE = new LangTagsData(); + + private final Multimap wikiData; + private final Map jsonData; + private final Errors processErrors = new Errors(); + + private LangTagsData() { + wikiData = readWikidata(); + jsonData = readJson(); + } + + public static LangTagsData getInstance() { + return INSTANCE; + } + + public static Multimap getWikiData() { + return getInstance().wikiData; + } + + public static Map getJsonData() { + return getInstance().jsonData; + } + + public static Errors getProcessErrors() { + return getInstance().processErrors; + } + + private Map readJson() { + + final LanguageTagParser ltpFull = new LanguageTagParser(); + final LanguageTagParser ltpTag = new LanguageTagParser(); + + Path path = Paths.get(CLDRPaths.BIRTH_DATA_DIR, "/../external/langtags.json"); + if (!Files.exists(path)) { + throw new IllegalArgumentException(path + " does not exist"); + } + + Matcher 
full = fullTagMatch.matcher(""); + Map errors = new TreeMap<>(); + + Output lastFull = new Output<>(); + Map result = new TreeMap<>(); + try { + Files.lines(path) + .forEach( + x -> { + if (full.reset(x).matches()) { + final String key = full.group(1); + final String value = full.group(2).replace("-", "_"); + if (value.startsWith("aai")) { + int debug = 0; + } + switch (key) { + case "full": + lastFull.value = value; + break; + case "tag": + try { + String fullLang = + ltpFull.set(lastFull.value).getLanguage(); + if (isIllFormed(lastFull.value, ltpFull) + || isIllFormed(value, ltpTag.set(value))) { + processErrors.put( + Errors.Type.ill_formed_tags, + value, + lastFull.value, + ""); + } else { + String reference = SIL; + final String fullScript = ltpFull.getScript(); + String fullRegion = ltpFull.getRegion(); + if (fullRegion.equals("ZZ") + || fullRegion.equals("001")) { + Collection tempRegions = + wikiData.get( + fullLang); // synthesize + if (!tempRegions.isEmpty()) { + fullRegion = + tempRegions.iterator().next(); + reference += " wikidata"; + } + } + + String tagLang = ltpTag.getLanguage(); + String tagScript = ltpTag.getScript(); + String tagRegion = ltpTag.getRegion(); + + if (!tagLang.equals(fullLang) + || (!tagScript.isEmpty() + && !tagScript.equals( + fullScript)) + || (!tagRegion.isEmpty() + && !tagRegion.equals( + fullRegion))) { + processErrors.put( + Errors.Type.tag_not_in_full, + value, + lastFull.value, + ""); + } else { + if (isOk( + fullLang, + fullScript, + fullRegion, + errors)) { + add( + result, + value, + fullLang, + fullScript, + fullRegion, + reference); + } else { + processErrors.put( + Errors.Type.skipping_scope, + value, + ltpFull.toString(), + errors.toString()); + } + } + } + } catch (Exception e) { + processErrors.put( + Errors.Type.exception, + value, + lastFull.value, + e.getMessage()); + } + break; + default: + throw new IllegalArgumentException(); // never happens + } + } + }); + return result; + } catch (IOException ex) { + throw 
new UncheckedIOException(ex); + } + } + + private boolean isIllFormed(String source, LanguageTagParser languageTagParser) { + return languageTagParser.getLanguage().isEmpty() + || !languageTagParser.getVariants().isEmpty() + || !languageTagParser.getExtensions().isEmpty() + || !languageTagParser.getLocaleExtensions().isEmpty() + || source.contains("@"); + } + + private boolean isOk(String lang, String script, String region, Map errors) { + errors.clear(); + if (!LIKELY_SPECIALS.contains(lang)) { + check(LstrType.language, lang, errors); + } + if (!FIX_VALIDITY.contains(script)) { + check(LstrType.script, script, errors); + } + if (region.equals("001") && Iso639Data.getType(lang) == Type.Constructed) { + // ok + } else { + check(LstrType.region, region, errors); + } + return errors.isEmpty(); + } + + private void check(LstrType lstrType, String lang, Map errors) { + final Status status = validity.getCodeToStatus(lstrType).get(lang); + if (status != Status.regular) { + errors.put(lstrType, status); + } + } + + private Multimap readWikidata() { + Multimap result = TreeMultimap.create(); + Path path = + Paths.get(CLDRPaths.BIRTH_DATA_DIR, "/../external/wididata_lang_region.tsv") + .normalize(); + if (!Files.exists(path)) { + throw new IllegalArgumentException(path + " does not exist"); + } + try { + Files.lines(path) + .forEach( + x -> { + if (!x.startsWith("#")) { + List list = TAB_SPLITTER.splitToList(x); + String lang = list.get(1); + String region = list.get(3); + result.put(lang, region); + } + }); + } catch (IOException ex) { + throw new UncheckedIOException(ex); + } + return result; + } + + private void add( + Map result, + String source, + String lang, + final String script, + final String region, + String reference) { + LSRSource old = result.get(source); + LSRSource newVersion = new LSRSource(lang, script, region, reference); + if (old != null && !old.equals(newVersion)) { + throw new IllegalArgumentException( + "Data already exists for " + source + ": old=" 
+ old + ", new: " + newVersion); + } + result.put(source, newVersion); + } + + private static class Errors { + public enum Type { + ill_formed_tags("Ill-formed tags"), + already_CLDR("Language already in CLDR"), + tag_not_in_full("tag ⊄ full"), + exception("exception"), + skipping_scope("Skipping scope, SIL"); + + private final String printable; + + private Type(String printable) { + this.printable = printable; + } + } + + public Multimap data = TreeMultimap.create(); + + public void put( + Type illFormedTags, String tagValue, String fullValue, String errorMessage) { + data.put( + illFormedTags, + tagValue + + " ➡ " + + fullValue + + (errorMessage == null || errorMessage.isEmpty() + ? "" + : "\t—\t" + errorMessage)); + } + + public void printAll() { + for (Entry> entry : data.asMap().entrySet()) { + Type type = entry.getKey(); + System.out.println(); + for (String message : entry.getValue()) { + System.out.println(type + "\t" + message); + } + } + } + } + + static class LSRSource implements Comparable { + final Row.R4 data; + + LSRSource(String lang, String script, String region, String source) { + if (script.contains("Soyo") || region.contains("Soyo")) { + int debug = 0; + } + data = Row.of(lang, script, region, source); + data.freeze(); + } + + @Override + public String toString() { + return combineLSR(data.get0(), data.get1(), data.get2()) + " // " + data.get3(); + } + + @Override + public int compareTo(LSRSource o) { + return data.compareTo(o.data); + } + + @Override + public int hashCode() { + return data.hashCode(); + } + + @Override + public boolean equals(Object obj) { + return data.equals(obj); + } + + public String line(String source) { + // TODO Auto-generated method stub + // + // + final String target = combineLSR(data.get0(), data.get1(), data.get2()); + final String origin = data.get3(); + final String result = + "" + + "\t"; + return result; + } + + public static String combineLSR(String lang, String script, String region) { + return lang + + 
(script.isEmpty() ? "" : "_" + script) + + (region.isEmpty() ? "" : "_" + region); + } + } +}