diff --git a/common/properties/scriptMetadata.txt b/common/properties/scriptMetadata.txt index 0ac8ab4bc79..9dee6c4680c 100644 --- a/common/properties/scriptMetadata.txt +++ b/common/properties/scriptMetadata.txt @@ -180,7 +180,7 @@ Onao; 33; 1E5D0; IN; 1; EXCLUSION; NO; NO; MIN; NO; NO # provisional data for f Orkh; 33; 10C00; MN; 1; EXCLUSION; YES; NO; NO; NO; NO Osge; 33; 104B5; US; 1; LIMITED_USE; NO; NO; NO; NO; YES Osma; 33; 10480; SO; 1; EXCLUSION; NO; NO; NO; NO; NO -Ougr; 33; 10F7C; 143; 1; EXCLUSION; YES; NO; YES; NO; NO +Ougr; 33; 10F7C; CN; 1; EXCLUSION; YES; NO; YES; NO; NO Palm; 33; 10873; SY; 1; EXCLUSION; YES; NO; NO; NO; NO Pauc; 33; 11AC0; MM; 1; EXCLUSION; NO; NO; NO; NO; NO Perm; 33; 1036B; RU; 1; EXCLUSION; NO; NO; NO; NO; NO diff --git a/common/supplemental/likelySubtags.xml b/common/supplemental/likelySubtags.xml index b22f2c88f7c..f595591b068 100644 --- a/common/supplemental/likelySubtags.xml +++ b/common/supplemental/likelySubtags.xml @@ -8,9489 +8,7757 @@ CLDR data files are interpreted according to the LDML specification (http://unicdiff --git a/common/supplemental/supplementalData.xml b/common/supplemental/supplementalData.xml index ac6f516ee3f..7ddde428e6c 100644 --- a/common/supplemental/supplementalData.xml +++ b/common/supplemental/supplementalData.xml @@ -4226,7 +4226,7 @@ XXX Code for transations where no currency is involved - + diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/draft/ScriptMetadata.java b/tools/cldr-code/src/main/java/org/unicode/cldr/draft/ScriptMetadata.java index f4ec6489ca0..f7f56941980 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/draft/ScriptMetadata.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/draft/ScriptMetadata.java @@ -23,6 +23,9 @@ import org.unicode.cldr.util.Containment; import org.unicode.cldr.util.SemiFileReader; import org.unicode.cldr.util.StandardCodes; +import org.unicode.cldr.util.StandardCodes.LstrType; +import org.unicode.cldr.util.Validity; +import 
org.unicode.cldr.util.Validity.Status; import org.unicode.cldr.util.With; public class ScriptMetadata { @@ -139,6 +142,12 @@ public static void addNameToCode(String type, Map hashMap) { public static final class SkipNewUnicodeException extends ICUException {} + /** + * Scripts that either have no known languages as yet (Cpmn) or are used for any language + * (Brai). + */ + public static final Set SCRIPTS_WITH_NO_LANGUAGES = Set.of("Brai", "Cpmn"); + public static class Info implements Comparable { public final int rank; public final VersionInfo age; @@ -173,6 +182,7 @@ private Info(String[] items) { ime = trinaryLookup.forString(Column.IME.getItem(items)); hasCase = trinaryLookup.forString(Column.HAS_CASE.getItem(items)); density = Column.DENSITY.getInt(items, -1); + String script = items[2]; final String countryRaw = Column.ORIGIN_COUNTRY.getItem(items); String country = CountryCodeConverter.getCodeFromName(countryRaw, false); @@ -191,6 +201,39 @@ private Info(String[] items) { langCode = null; } likelyLanguage = langCode == null ? 
"und" : langCode; + + // check for bad countries, bad languages + + final Status scriptStatus = + Validity.getInstance().getCodeToStatus(LstrType.script).get(script); + if (!(scriptStatus == Status.special || scriptStatus == Status.unknown)) { + final Status countryStatus = + Validity.getInstance().getCodeToStatus(LstrType.region).get(originCountry); + if (countryStatus != Status.regular) { + errors.add( + "ScriptMetadata.java: the country (" + + originCountry + + ") for " + + script + + " is not valid: " + + countryStatus); + } + final Status languageStatus = + Validity.getInstance() + .getCodeToStatus(LstrType.language) + .get(likelyLanguage); + if (languageStatus != Status.regular + // make exception for scripts that has no known languages + && !SCRIPTS_WITH_NO_LANGUAGES.contains(script)) { + errors.add( + "ScriptMetadata.java: the likely language (" + + likelyLanguage + + ") for " + + script + + " is not valid: " + + languageStatus); + } + } } public Info(Info other, String string, String sampleCharacter) { diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateLikelySubtags.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateLikelySubtags.java index 492bcadaad4..73cb1248901 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateLikelySubtags.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateLikelySubtags.java @@ -6,6 +6,7 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Sets; import com.ibm.icu.impl.Relation; import com.ibm.icu.impl.Row; import com.ibm.icu.impl.Row.R2; @@ -13,12 +14,14 @@ import com.ibm.icu.impl.locale.XCldrStub.Splitter; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.text.NumberFormat; +import com.ibm.icu.util.Output; import java.io.File; import java.io.IOException; import java.io.PrintWriter; import java.util.Comparator; import 
java.util.HashMap; import java.util.HashSet; +import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Map.Entry; @@ -44,6 +47,7 @@ import org.unicode.cldr.util.LanguageTagParser; import org.unicode.cldr.util.LocaleNames; import org.unicode.cldr.util.LocaleScriptInfo; +import org.unicode.cldr.util.LocaleValidator; import org.unicode.cldr.util.SimpleFactory; import org.unicode.cldr.util.StandardCodes; import org.unicode.cldr.util.StandardCodes.LstrType; @@ -65,6 +69,8 @@ public class GenerateLikelySubtags { private static final Map LANGUAGE_CODE_TO_STATUS = Validity.getInstance().getCodeToStatus(LstrType.language); + private static final Map SCRIPT_CODE_TO_STATUS = + Validity.getInstance().getCodeToStatus(LstrType.script); private static final String TEMP_UNKNOWN_REGION = "XZ"; @@ -96,7 +102,7 @@ public class GenerateLikelySubtags { private static boolean DROP_HARDCODED = false; private enum MyOptions { - minimize(new Params().setHelp("Show minimization actions")), + minimize(new Params().setHelp("Show minimization actions (")), add(new Params().setHelp("Show additions")), population(new Params().setHelp("Show population data used")), order(new Params().setHelp("Show the priority order for langauge data")), @@ -141,6 +147,7 @@ private static Set parse(String[] args) { static final Map silData = LangTagsData.getJsonData(); public static void main(String[] args) throws IOException { + System.out.println("Use options to track progress, -w to narrow to specific subtags."); MyOptions.parse(args); SHOW_ADD = MyOptions.add.option.doesOccur(); SHOW_MIN = MyOptions.minimize.option.doesOccur(); @@ -161,50 +168,77 @@ public static void main(String[] args) throws IOException { jsonErrors.printAll(); } - Map old = supplementalData.getLikelySubtags(); Map oldOrigins = supplementalData.getLikelyOrigins(); System.out.println("origins: " + new TreeSet<>(oldOrigins.values())); - Map toMaximized = generatePopulationData(new 
TreeMap<>(LOCALE_SOURCE)); + Map baseMappings = generatePopulationData(new TreeMap<>(LOCALE_SOURCE)); + System.out.println(JOIN_TAB.join("\nBase data:", baseMappings.size())); Map itemsRemoved = new TreeMap<>(); - Map result = minimize(toMaximized, itemsRemoved); + Map minimizedMappings = minimize(baseMappings, itemsRemoved); + System.out.println(JOIN_TAB.join("\nMinimized:", minimizedMappings.size())); + + // Verify that the minimized version produces the same results + + LikelySubtags max = new LikelySubtags(baseMappings); + LikelySubtags min = new LikelySubtags(minimizedMappings); + + Map minFailures = new TreeMap<>(LOCALE_SOURCE); + int failures = 0; + System.out.println( + "\nVerifying that Minimizing doesn't change function\n" + + JOIN_TAB.join("status, source, maxTarg, minTarg".split(", "))); + for (String source : baseMappings.keySet()) { + String orgTarg = max.maximize(source); + String minTarg = min.maximize(source); + if (!orgTarg.equals(minTarg)) { + minFailures.put(source, orgTarg); + System.out.println(JOIN_TAB.join("Fail", source, orgTarg, minTarg)); + failures++; + } else { + if (watching(SHOW_MIN, source, orgTarg, minTarg)) { + System.out.println(JOIN_TAB.join("Watch", source, orgTarg, minTarg)); + } + } + } + if (failures != 0) { + throw new IllegalArgumentException(); + } - Set newAdditions = new TreeSet(); - Set newMissing = new TreeSet(); + Set newAdditions = new TreeSet<>(); + Set newMissing = new TreeSet<>(); // Check against last version - System.out.println(JOIN_TAB.join("Source", "Name", "oldValue", "Name", "newValue", "Name")); + System.out.println("\nReading old supplemental: may have unrelated errors."); + final SupplementalDataInfo oldSupplementalInfo = + SupplementalDataInfo.getInstance( + CldrUtility.getPath(CLDRPaths.LAST_COMMON_DIRECTORY, "supplemental/")); + final Map oldLikelyData = oldSupplementalInfo.getLikelySubtags(); + final Map oldLikelyOrigins = oldSupplementalInfo.getLikelyOrigins(); + LikelySubtags oldLikely = new 
LikelySubtags(oldLikelyData); Set sorted = new TreeSet<>(LOCALE_SOURCE); - sorted.addAll(result.keySet()); - sorted.addAll(old.keySet()); + sorted.addAll(minimizedMappings.keySet()); + sorted.addAll(oldLikelyData.keySet()); - for (String source : sorted) { - String oldValue = old.get(source); - String newValue = result.get(source); - String removal = itemsRemoved.get(source); + System.out.println( + "\nCheck against last version\n" + + JOIN_TAB.join("Source", "Name", "oldValue", "Name", "newValue", "Name")); - if (newValue == null) { - LSRSource silValue = silData.get(source); - if (silValue != null) { - newValue = silValue.getLsrString(); - } + for (String source : sorted) { + String oldValue = oldLikely.maximize(source); + String oldOrigin = oldLikelyOrigins.get(source); + if (oldOrigin != null && oldOrigin.contains("sil1")) { + continue; // we don't control variations in sil data } + String newValue = min.maximize(source); + String removal = itemsRemoved.get(source); if (Objects.equal(oldValue, newValue)) { continue; } - - // SKIP the sil values; those will be recreated - - final String origins = oldOrigins.get(source); - if (origins != null && origins.contains("sil1")) { - continue; // skip for now - } - // skip new values, or oldValues that are specifically removed if (oldValue == null || oldValue.equals(removal)) { @@ -231,7 +265,7 @@ public static void main(String[] args) throws IOException { } System.out.println("new missing\t" + newMissing); - printLikelySubtags(result); + printLikelySubtags(minimizedMappings); } static { @@ -247,7 +281,7 @@ public static void main(String[] args) throws IOException { } private static final List KEEP_TARGETS = - DROP_HARDCODED ? List.of() : List.of("und_Arab_PK", "und_Latn_ET", "hi_Latn"); + DROP_HARDCODED ? List.of() : List.of("und_Arab_PK", "und_Latn_ET"); private static final ImmutableSet deprecatedISONotInLST = DROP_HARDCODED ? 
ImmutableSet.of() : ImmutableSet.of("scc", "scr"); @@ -295,7 +329,6 @@ public static void main(String[] args) throws IOException { "ojs_Cans_CA", "oka_Latn_CA", "pqm_Latn_CA", - "hi_Latn_IN", "no_Latn_NO", "tok_Latn_001", "prg_Latn_PL", @@ -306,7 +339,9 @@ public static void main(String[] args) throws IOException { * results. Safer is to add to MAX_ADDITIONS. However, if you add, add both the language and * language+script mappings. */ + // Many of the overrides below can be removed once the language/pop/country data is updated. + private static final Map LANGUAGE_OVERRIDES = CldrUtility.asMap( DROP_HARDCODED @@ -361,7 +396,7 @@ public static void main(String[] args) throws IOException { {"sr_Latn", "sr_Latn_RS"}, {"ss", "ss_Latn_ZA"}, {"ss_Latn", "ss_Latn_ZA"}, - {"swc", "swc_Latn_CD"}, + // {"swc", "swc_Latn_CD"}, {"ti", "ti_Ethi_ET"}, {"ti_Ethi", "ti_Ethi_ET"}, {LocaleNames.UND, "en_Latn_US"}, @@ -371,7 +406,6 @@ public static void main(String[] args) throws IOException { {"und_Arab_PK", "ur_Arab_PK"}, {"und_Bopo", "zh_Bopo_TW"}, {"und_Deva_FJ", "hif_Deva_FJ"}, - {"und_EZ", "de_Latn_EZ"}, {"und_Hani", "zh_Hani_CN"}, {"und_Hani_CN", "zh_Hani_CN"}, {"und_Kana", "ja_Kana_JP"}, @@ -393,8 +427,6 @@ public static void main(String[] args) throws IOException { {"und_SO", "so_Latn_SO"}, {"und_SS", "en_Latn_SS"}, {"und_TK", "tkl_Latn_TK"}, - {"und_UN", "en_Latn_UN"}, - {"und_005", "pt_Latn_BR"}, {"vo", "vo_Latn_001"}, {"vo_Latn", "vo_Latn_001"}, // {"yi", "yi_Hebr_001"}, @@ -441,7 +473,6 @@ public static void main(String[] args) throws IOException { // { "mis_Medf", "mis_Medf_NG" }, {"ku_Yezi", "ku_Yezi_GE"}, - {"und_EU", "en_Latn_IE"}, {"hnj", "hnj_Hmnp_US"}, // preferred lang/script in CLDR {"hnj_Hmnp", "hnj_Hmnp_US"}, {"und_Hmnp", "hnj_Hmnp_US"}, @@ -461,6 +492,25 @@ public static void main(String[] args) throws IOException { {"und_CC", "ms_Arab_CC"}, {"und_SL", "kri_Latn_SL"}, {"und_SS", "ar_Arab_SS"}, + + // additions for missing values from LikelySubtagsText + 
{"und_Arab_AF", "fa_Arab_AF"}, + {"und_Cyrl_BG", "bg_Cyrl_BG"}, + {"und_Tibt_BT", "dz_Tibt_BT"}, + {"und_Cyrl_BY", "be_Cyrl_BY"}, + {"und_Arab_CC", "ms_Arab_CC"}, + {"und_Ethi_ER", "ti_Ethi_ER"}, + {"und_Arab_IR", "fa_Arab_IR"}, + {"und_Cyrl_KG", "ky_Cyrl_KG"}, + {"und_Cyrl_MK", "mk_Cyrl_MK"}, + {"und_Cyrl_MN", "mn_Cyrl_MN"}, + {"und_Deva_NP", "ne_Deva_NP"}, + {"und_Cyrl_RS", "sr_Cyrl_RS"}, + {"und_Cyrl_TJ", "tg_Cyrl_TJ"}, + {"und_Cyrl_UA", "uk_Cyrl_UA"}, + {"arc_Hatr", "arc_Hatr_IQ"}, + {"hnj_Hmng", "hnj_Hmng_LA"}, + {"bap_Krai", "bap_Krai_IN"}, }); /** @@ -658,8 +708,11 @@ private static Map generatePopulationData(Map to // Set,Double> rowsToCounts = new TreeMap(); MaxData maxData = new MaxData(); Set cldrLocales = factory.getAvailable(); + // skip ZZ Set otherTerritories = - new TreeSet<>(standardCodes.getGoodAvailableCodes("territory")); + new TreeSet<>( + Sets.difference( + standardCodes.getGoodAvailableCodes("territory"), Set.of("ZZ"))); // process all the information to get the top values for each triple. // each of the combinations of 1 or 2 components gets to be a key. 
@@ -691,23 +744,26 @@ private static Map generatePopulationData(Map to if (data.getOfficialStatus() == OfficialStatus.unknown) { final String locale = writtenLanguage + "_" + region; - if (literatePopulation >= minimalLiteratePopulation) { - // ok, skip - } else if (literatePopulation >= MIN_UNOFFICIAL_CLDR_LANGUAGE_SIZE - && cldrLocales.contains(locale)) { - // ok, skip - } else { - // if (SHOW_ADD) - // System.out.println("Skipping:\t" + writtenLanguage + "\t" + region + "\t" - // + english.getName(locale) - // + "\t-- too small:\t" + number.format(literatePopulation)); - // continue; - } + // if (literatePopulation >= minimalLiteratePopulation) { + // // ok, skip + // } else if (literatePopulation >= + // MIN_UNOFFICIAL_CLDR_LANGUAGE_SIZE + // && cldrLocales.contains(locale)) { + // // ok, skip + // } else { + // // if (SHOW_ADD) + // // System.out.println("Skipping:\t" + writtenLanguage + // + "\t" + region + "\t" + // // + english.getName(locale) + // // + "\t-- too small:\t" + + // number.format(literatePopulation)); + // // continue; + // } order *= UNOFFICIAL_SCALE_DOWN; if (watching(SHOW_POP, writtenLanguage)) System.out.println( JOIN_TAB.join( - "Pop:", + "Scaling unofficial: ", writtenLanguage, region, getNameSafe(locale), @@ -778,7 +834,10 @@ private static Map generatePopulationData(Map to // add others, with English default for (String region : otherTerritories) { - if (region.length() == 3) continue; // FIX ONCE WE ADD REGIONS + if (!LocaleValidator.ALLOW_IN_LIKELY.isAllowed(LstrType.region, region, null, null)) { + continue; + } + if (region.length() == 3) continue; // handled with exceptions maxData.add("en", "Latn", region, 1.0); } @@ -797,23 +856,36 @@ private static Map generatePopulationData(Map to if (replacements == null) { continue; } - String goodLanguage = replacements.get(0); String badLanguage = str.getKey(); - if (badLanguage.contains("_")) { + if (badLanguage.contains("_")) { // only single subtag continue; } + if 
(deprecatedISONotInLST.contains(badLanguage)) { continue; } + + if (LANGUAGE_CODE_TO_STATUS.get(badLanguage) != Validity.Status.regular) { + if (!LocaleValidator.ALLOW_IN_LIKELY.isAllowed( + LstrType.language, badLanguage, null, null)) { + continue; + } + } + + // see what the values are for the replacements + + String goodLanguage = replacements.get(0); Set> goodLanguageData = maxData.languages.getAll(goodLanguage); if (goodLanguageData == null) { continue; } + R3 value = goodLanguageData.iterator().next(); final String script = value.get1(); final String region = value.get2(); + maxData.add(badLanguage, script, region, 1.0); System.out.println( "Adding aliases: " @@ -829,8 +901,8 @@ private static Map generatePopulationData(Map to // now, get the best for each one for (String language : maxData.languages.keySet()) { R3 value = maxData.languages.getAll(language).iterator().next(); - final Comparable script = value.get1(); - final Comparable region = value.get2(); + final String script = value.get1(); + final String region = value.get2(); add( language, language + "_" + script + "_" + region, @@ -992,12 +1064,29 @@ private static Map generatePopulationData(Map to TreeSet sorted = new TreeSet<>(ScriptMetadata.getScripts()); for (String script : sorted) { + switch (SCRIPT_CODE_TO_STATUS.get(script)) { + case special: + case unknown: + continue; + default: + break; + } Info i = ScriptMetadata.getInfo(script); String likelyLanguage = i.likelyLanguage; + String originCountry = i.originCountry; if (LANGUAGE_CODE_TO_STATUS.get(likelyLanguage) == Status.special) { likelyLanguage = LocaleNames.UND; } - String originCountry = i.originCountry; + LanguageTagParser ltp = + new LanguageTagParser() + .setLanguage(likelyLanguage) + .setScript(script) + .setRegion(originCountry); + Set errors = new LinkedHashSet<>(); + if (!LocaleValidator.isValid(ltp, LocaleValidator.ALLOW_IN_LIKELY, errors)) { + System.out.println(JOIN_LS.join("Failure in ScriptMetaData: " + ltp, errors)); + 
continue; + } final String result = likelyLanguage + "_" + script + "_" + originCountry; add("und_" + script, result, toMaximized, "S->LR•", LocaleOverride.KEEP_EXISTING); add(likelyLanguage, result, toMaximized, "L->SR•", LocaleOverride.KEEP_EXISTING); @@ -1044,7 +1133,7 @@ private static Map generatePopulationData(Map to toMaximized.remove(row.get(0)); } } - return toMaximized; + return CldrUtility.protectCollection(toMaximized); } /** Class for maximizing data sources */ @@ -1090,6 +1179,17 @@ public static class MaxData { * @param order */ void add(String language, String script, String region, Double order) { + // check for bad codes sneaking in + LanguageTagParser ltp = + new LanguageTagParser() + .setLanguage(language) + .setScript(script) + .setRegion(region); + Set errors = new LinkedHashSet<>(); + if (!LocaleValidator.isValid(ltp, LocaleValidator.ALLOW_IN_LIKELY, errors)) { + System.out.println(JOIN_LS.join("Bad Add of " + ltp, errors)); + } + if (watching(SHOW_ORDER, language)) System.out.println( JOIN_TAB.join( @@ -1161,12 +1261,8 @@ private static void add( "", kind)); } - } else if (override == LocaleOverride.KEEP_EXISTING || value.equals(oldValue)) { - // if (showAction) { - // System.out.println("Skipping:\t" + key + "\t→\t" + value + "\t\t\t\t" + kind); - // } - return; - } else { + toAdd.put(key, value); + } else if (override != LocaleOverride.KEEP_EXISTING && !value.equals(oldValue)) { if (watching(showAction, key, value)) { System.out.println( JOIN_TAB.join( @@ -1182,8 +1278,8 @@ private static void add( getNameSafe(oldValue), kind)); } + toAdd.put(key, value); } - toAdd.put(key, value); } public static String truncateLongString(Object data, int maxLen) { @@ -1197,65 +1293,177 @@ public static String truncateLongString(Object data, int maxLen) { return info; } + enum LsrType { + LSR, + LS, + LR, + SR, + L, + S, + R + } + + /** + * Minimize
+ * We know that the following algorithm will be used in the lookup, so we remove mappings that + * are redundant. https://cldr-smoke.unicode.org/spec/main/ldml/tr35.html#likely-subtags
+ * A subtag is called empty if it is a missing script or region subtag, or it is a base language + * subtag with the value "und". In the description below, a subscript on a subtag x indicates + * which tag it is from: x<sub>s</sub> is in the source, x<sub>m</sub> is in a match, and x<sub>r</sub> is in the final result. + * + *

Lookup. Look up each of the following in order, and stop on the first match: + * + *

    + *
  1. language<sub>s</sub>_script<sub>s</sub>_region<sub>s</sub> + *
  2. language<sub>s</sub>_script<sub>s</sub> + *
  3. language<sub>s</sub>_region<sub>s</sub> + *
  4. language<sub>s</sub> + *
+ * + *

Return + * + *

+ * + *

    + *
  1. If there is no match, signal an error and stop. + *
  2. Otherwise there is a match = language<sub>m</sub>_script<sub>m</sub>_region<sub>m</sub> + *
  3. Let x<sub>r</sub> = x<sub>s</sub> if x<sub>s</sub> is neither empty nor 'und', and x<sub>m</sub> otherwise. + *
  4. Return the language tag composed of language<sub>r</sub>_script<sub>r</sub>_region<sub>r</sub> + variants + extensions. + *
+ */ public static Map minimize( - Map fluffup, Map itemsRemoved) { - LanguageTagParser parser = new LanguageTagParser(); - LanguageTagParser targetParser = new LanguageTagParser(); - Map removals = new TreeMap<>(); + Map max, Map itemsRemoved) { + + final LanguageTagParser sourceParser = new LanguageTagParser(); + final Map removals = new TreeMap<>(); + final Map toMinimize = new TreeMap<>(LOCALE_SOURCE); + final Output intermediate = new Output<>(); + + toMinimize.putAll(max); + + // Remove redundant mappings. + // For example, suppose we have the following mappings: + // {aa=aa_Latn_ET, aa_DJ=aa_Latn_DJ, aa_ER=aa_Latn_ER} + // Using the algorithm above if aa_DJ=aa_Latn_DJ were not there we would + // 1. check for aa_DJ, fail + // 2. check for aa, get aa_Latn_ET, and substitute DJ for ET, getting the right answer. + // So aa_DJ is redundant + + // Dependencies + // We should never have an LocaleScriptInfo.UNKNOWN_REGION, or + // LocaleScriptInfo.UNKNOWN_SCRIPT + // The unit tests will guarantee this if somehow we slip up + // Similarly, we should never have the target have language="und", or be missing script or + // region + // We also know that the source never has 3 full fields (ie, never L≠und && S≠"" && R≠"") + + // Make multiple passes if necessary for (int pass = 0; ; ++pass) { removals.clear(); - for (Entry entry : fluffup.entrySet()) { - String locale = entry.getKey(); + for (Entry entry : toMinimize.entrySet()) { + String source = entry.getKey(); + if (source.equals("und")) { + continue; // never remove + } String target = entry.getValue(); - - if (targetParser.set(target).getRegion().equals(LocaleScriptInfo.UNKNOWN_REGION)) { - removals.put(locale, target); - showRemoving(pass, locale, target, "Unknown Region in target"); - continue; + if (source.equals("aa_DJ") || source.equals("und_Arab_AF")) { + int debug = 0; } - if (targetParser.getScript().equals(LocaleScriptInfo.UNKNOWN_SCRIPT)) { - removals.put(locale, target); - showRemoving(pass, locale, 
target, "Unknown Script in target"); - continue; + sourceParser.set(source); + + if (!sourceParser.getLanguage().equals("und") + && !sourceParser.getScript().isEmpty() + && !sourceParser.getRegion().isEmpty()) { + throw new IllegalArgumentException("Bogus source: " + source); } - String region = parser.set(locale).getRegion(); - if (region.length() != 0) { - if (region.equals(LocaleScriptInfo.UNKNOWN_REGION)) { - removals.put(locale, target); - showRemoving(pass, locale, target, "Unknown Region in source"); - continue; - } - parser.setRegion(""); - String newLocale = parser.toString(); - String newTarget = fluffup.get(newLocale); - if (newTarget != null) { - newTarget = targetParser.set(newTarget).setRegion(region).toString(); - if (target.equals(newTarget) && !KEEP_TARGETS.contains(locale)) { - removals.put(locale, target); - showRemoving(pass, locale, target, "Redundant with\t" + newLocale); + // The following has some redundant checks, but it makes the + // code more convoluted to catch them, and perf is not an issue. 
+ + String trial; + + // und_Cyrl_RU => ru_Cyrl_RU, but und_Cyrl => ru_Cyrl_RU + // und_Latn_DE => de_Latn_DE, but und_DE => de_Latn_DE + // und_Latn_US => en_Latn_US, but und => en_Latn_US + + if (!sourceParser.getScript().isEmpty() && !sourceParser.getRegion().isEmpty()) { + trial = + compose( + sourceParser.getLanguage(), + sourceParser.getScript(), + sourceParser.getRegion()); + if (!trial.equals(source)) { + String result = + matchAndFill( + sourceParser, trial, removals, toMinimize, intermediate); + if (target.equals(result)) { + removals.put(source, target); + showRemoving(LsrType.LSR, source, target, trial, intermediate.value); continue; } } } - String script = parser.set(locale).getScript(); - if (locale.equals(DEBUG_ADD_KEY)) { - System.out.println("*debug*"); + + // de_Latn => de_Latn_DE, but de => de_Latn_DE + // und_Cyrl => ru_Cyrl_RU, but ru_Cyrl => ru_Cyrl_RU + + if (!sourceParser.getScript().isEmpty()) { + trial = compose(sourceParser.getLanguage(), sourceParser.getScript(), ""); + if (!trial.equals(source)) { + String result = + matchAndFill( + sourceParser, trial, removals, toMinimize, intermediate); + if (target.equals(result)) { + removals.put(source, target); + showRemoving(LsrType.LS, source, target, trial, intermediate.value); + continue; + } + } } - if (script.length() != 0) { - if (script.equals(LocaleScriptInfo.UNKNOWN_SCRIPT)) { - removals.put(locale, target); - showRemoving(pass, locale, target, "Unknown Script"); - continue; + + // de_DE => de_Latn_DE, but de => de_Latn_DE + // und_RU => ru_Cyrl_RU, but ru_RU => ru_Cyrl_RU + + if (!sourceParser.getRegion().isEmpty()) { + trial = compose(sourceParser.getLanguage(), "", sourceParser.getRegion()); + if (!trial.equals(source)) { + + String result = + matchAndFill( + sourceParser, trial, removals, toMinimize, intermediate); + if (target.equals(result) + && !fieldChangesLanguage( + LsrType.S, sourceParser, removals, toMinimize)) { + removals.put(source, target); + showRemoving(LsrType.LR, 
source, target, trial, intermediate.value); + continue; + } } - parser.setScript(""); - String newLocale = parser.toString(); - String newTarget = fluffup.get(newLocale); - if (newTarget != null) { - newTarget = targetParser.set(newTarget).setScript(script).toString(); - if (target.equals(newTarget) && !KEEP_TARGETS.contains(locale)) { - removals.put(locale, target); - showRemoving(pass, locale, target, "Redundant with\t" + newLocale); + } + + // ultimate fallback + + if (true) { + trial = sourceParser.getLanguage(); + if (!trial.equals(source)) { + String result = + matchAndFill( + sourceParser, trial, removals, toMinimize, intermediate); + if (target.equals(result) + && (sourceParser.getScript().isEmpty() + || sourceParser.getRegion().isEmpty() + || (!fieldChangesLanguage( + LsrType.S, + sourceParser, + removals, + toMinimize) + && !fieldChangesLanguage( + LsrType.R, + sourceParser, + removals, + toMinimize)))) { + removals.put(source, target); + showRemoving(LsrType.L, source, target, trial, intermediate.value); continue; } } @@ -1266,16 +1474,141 @@ public static Map minimize( } itemsRemoved.putAll(removals); for (String locale : removals.keySet()) { - fluffup.remove(locale); + toMinimize.remove(locale); + } + } + return CldrUtility.protectCollection(toMinimize); + } + + public static boolean fieldChangesLanguage( + LsrType lsrType, + final LanguageTagParser sourceParser, + final Map removals, + final Map toMinimize) { + if (!isEmpty(sourceParser, lsrType)) { + final LanguageTagParser tempParser = new LanguageTagParser(); + copyFrom(tempParser, LsrType.L, sourceParser); + copyFrom(tempParser, lsrType, sourceParser); + + // Special Check! + // Suppose we have + // (A) und_Arab_AF => fa_Arab_AF + // It appears we can remove (A) because we have + // (B) und_AF ==> fa_Arab_AF + // However, because script is checked before region + // We will have a first have a hit on + // (C) und_Arab => ar_Arab_xx + // Which will result in the wrong answer (ar_Arab_AF). 
+ + String trial2 = + compose(sourceParser.getLanguage(), getField(sourceParser, lsrType), ""); + String result2 = matchAndFill(sourceParser, trial2, removals, toMinimize, null); + if (result2 != null) { + final LanguageTagParser tempParser2 = new LanguageTagParser(); + tempParser2.set(result2); + String lang2 = tempParser2.getLanguage(); + String tempLang = tempParser.getLanguage(); + if (tempLang != lang2) { + return true; + } + } + } + return false; + } + + // Some of these would be useful on LanguageTagParser + + public static String getField(LanguageTagParser fromParser, LsrType lsr) { + switch (lsr) { + case L: + return fromParser.getLanguage(); + case S: + return fromParser.getScript(); + case R: + return fromParser.getRegion(); + default: + throw new IllegalArgumentException(); + } + } + + public static LanguageTagParser copyFrom( + LanguageTagParser intoParser, LsrType lsr, LanguageTagParser fromParser) { + switch (lsr) { + case L: + intoParser.setLanguage(fromParser.getLanguage()); + break; + case S: + intoParser.setScript(fromParser.getScript()); + break; + case R: + intoParser.setRegion(fromParser.getRegion()); + break; + default: + throw new IllegalArgumentException(); + } + return intoParser; + } + + public static LanguageTagParser ifEmptyCopyFrom( + LanguageTagParser intoParser, LsrType lsr, LanguageTagParser fromParser) { + return isEmpty(intoParser, lsr) ? intoParser : copyFrom(intoParser, lsr, fromParser); + } + + public static boolean isEmpty(LanguageTagParser intoParser, LsrType lsr) { + return getField(intoParser, lsr).equals(lsr == LsrType.L ? "und" : ""); + } + + public static String matchAndFill( + LanguageTagParser sourceParser, + String trial, + Map removals, + Map toMinimize, + Output intermediate) { + String possibleSuper; + String result; + possibleSuper = removals.containsKey(trial) ? 
null : toMinimize.get(trial); + result = null; + if (possibleSuper != null) { + LanguageTagParser tempParser3 = new LanguageTagParser(); + tempParser3.set(possibleSuper); + if (!sourceParser.getLanguage().equals("und")) { + tempParser3.setLanguage(sourceParser.getLanguage()); + } + if (!getField(sourceParser, LsrType.S).isEmpty()) { + copyFrom(tempParser3, LsrType.S, sourceParser); + } + if (!sourceParser.getRegion().isEmpty()) { + tempParser3.setRegion(sourceParser.getRegion()); } + result = tempParser3.toString(); } - return fluffup; + if (intermediate != null) { + intermediate.value = possibleSuper; + } + return result; + } + + private static String compose(String lang, String script, String region) { + String result = lang; + if (!script.isEmpty()) { + result += "_" + script; + } + if (!region.isEmpty()) { + result += "_" + region; + } + return result; + } + + static class MapView { + K skip; } public static void showRemoving( - Object pass, String locale, String target, final String reason) { - if (watching(SHOW_MIN, target)) { - System.out.println(JOIN_TAB.join(pass, "Removing:", locale, "→", target, "", reason)); + Object pass, String locale, String target, String fallback, String fallbackTarget) { + if (watching(SHOW_MIN, locale, target, fallback, fallbackTarget)) { + System.out.println( + JOIN_TAB.join( + pass, "Removing: ", locale, "→", target, fallback, fallbackTarget)); } } @@ -1309,7 +1642,7 @@ public static String printingName(String locale, Joiner spacing) { static final String arrow = OUTPUT_STYLE == OutputStyle.PLAINTEXT ? 
"\t⇒\t" : "\t➡ "; private static File printLikelySubtags(Map fluffup) throws IOException { - final File genDir = new File(CLDRPaths.GEN_DIRECTORY, "supplemental"); + final File genDir = new File(CLDRPaths.DEFAULT_SUPPLEMENTAL_DIRECTORY); final File genFile = new File( genDir, diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/LangTagsData.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/LangTagsData.java index 488e9f41641..f210ab0abec 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/LangTagsData.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/LangTagsData.java @@ -5,9 +5,9 @@ import com.google.common.collect.ImmutableSet; import com.google.common.collect.Multimap; import com.google.common.collect.TreeMultimap; +import com.ibm.icu.util.ICUUncheckedIOException; import com.ibm.icu.util.Output; import java.io.IOException; -import java.io.UncheckedIOException; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; @@ -260,7 +260,7 @@ private Map readJson() { processErrors.data = CldrUtility.protectCollection(processErrors.data); return CldrUtility.protectCollection(result); } catch (IOException ex) { - throw new UncheckedIOException(ex); + throw new ICUUncheckedIOException(ex); } } @@ -315,7 +315,7 @@ private Multimap readWikidata() { } }); } catch (IOException ex) { - throw new UncheckedIOException(ex); + throw new ICUUncheckedIOException(ex); } return ImmutableMultimap.copyOf(result); } diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/LocaleValidator.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/LocaleValidator.java index 0a6532e4d59..b83c5b57d59 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/LocaleValidator.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/LocaleValidator.java @@ -22,6 +22,15 @@ public class LocaleValidator { static final SupplementalDataInfo SDI = SupplementalDataInfo.getInstance(); + /** For backwards 
compatibility, certain non-regular codes are allowed in LikelySubtags. */ + public static final LocaleValidator.AllowedValid ALLOW_IN_LIKELY = + new LocaleValidator.AllowedValid( + null, + LstrType.region, + new LocaleValidator.AllowedMatch("001|419"), + LstrType.language, + new LocaleValidator.AllowedMatch("und|in|iw|ji|jw|mo|tl")); + static final Validity VALIDITY = Validity.getInstance(); static final Set FIELD_ALLOWS_EMPTY = Set.of(LstrType.script, LstrType.region); // Map>> @@ -100,18 +109,21 @@ public static class AllowedValid { private final Set allowedStatus; // allowed without exception private final Multimap allowedExceptions; - boolean isAllowed(Validity.Status status) { + public boolean isAllowed(Validity.Status status) { return allowedStatus.contains(status); } /** Only called if isAllowed is not true */ - boolean isAllowed(LstrType lstrType, String key, String value, Validity.Status status) { + public boolean isAllowed( + LstrType lstrType, String key, String value, Validity.Status status) { Collection allowedMatches = allowedExceptions.get(lstrType); if (allowedMatches == null) { return false; } for (AllowedMatch allowedMatch : allowedMatches) { - if (allowedMatch.matches(key, value, status)) return true; + if (allowedMatch.matches(key, value, status)) { + return true; + } } return false; } diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/SupplementalDataInfo.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/SupplementalDataInfo.java index a1491b31f63..e71a1ad97f4 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/SupplementalDataInfo.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/SupplementalDataInfo.java @@ -463,7 +463,7 @@ public BasicLanguageData addTerritory(String territory) { throw new IllegalArgumentException("Illegal Territory: " + territory); } if (territories == Collections.EMPTY_SET) { - territories = new TreeSet<>(); + territories = new LinkedHashSet<>(); } 
territories.add(territory); return this; diff --git a/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/Script_Metadata.csv b/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/Script_Metadata.csv index bdf7170097b..408ab50a7ab 100644 --- a/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/Script_Metadata.csv +++ b/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/Script_Metadata.csv @@ -128,7 +128,7 @@ WR,Name,Script_Code,Age,Size,Sample,Sample_Code,Origin Country,~Density,Likely L 126,Warang_Citi,Wara,7.0,84,𑢴,118B4,India,1,Ho,hoc,Exclusion,no,no,no,no,Yes 127,Ahom,Ahom,8.0,0,𑜗,11717,India,1,Ahom,aho,Exclusion,no,Yes,Yes,no,no 128,Anatolian_Hieroglyphs,Hluw,8.0,0,𔐀,14400,Turkey,1,Hieroglyphic Luwian,hlu,Exclusion,no,no,no,Yes,no -129,Hatran,Hatr,8.0,0,𐣴,108F4,Iraq,1,Uncoded Languages,mis,Exclusion,Yes,no,no,no,no +129,Hatran,Hatr,8.0,0,𐣴,108F4,Iraq,1,Aramaic,arc,Exclusion,Yes,no,no,no,no 130,Multani,Mult,8.0,0,𑊏,1128F,Pakistan,1,Seraiki,skr,Exclusion,no,no,no,no,no 131,Old_Hungarian,Hung,8.0,0,𐲡,10CA1,Hungary,1,Hungarian,hu,Exclusion,Yes,no,no,no,Yes 132,SignWriting,Sgnw,8.0,0,𝡐,1D850,USA,1,American Sign Language,ase,Exclusion,no,no,no,Yes,no @@ -158,7 +158,7 @@ WR,Name,Script_Code,Age,Size,Sample,Sample_Code,Origin Country,~Density,Likely L 156,Khitan small script,Kits,13.0,0,𘱥,18C65,China,2,Khitan,zkt,Exclusion,no,Yes,no,Yes,no 157,Yezidi,Yezi,13.0,0,𐺈,10E88,Georgia,1,Northern Kurdish,ku,Exclusion,Yes,no,no,no,no 158,Cypro_Minoan,Cpmn,14.0,0,𒿥,12FE5,Cyprus,2,unknown,und,Exclusion,no,no,no,Yes,no -159,Old_Uyghur,Ougr,14.0,0,𐽼,10F7C,Central Asia,1,Old Uyghur,oui,Exclusion,Yes,no,Yes,no,no +159,Old_Uyghur,Ougr,14.0,0,𐽼,10F7C,China,1,Old Uyghur,oui,Exclusion,Yes,no,Yes,no,no 160,Tangsa,Tnsa,14.0,0,𖪼,16ABC,India,1,Tangsa,nst,Exclusion,no,no,no,no,no 161,Toto,Toto,14.0,0,𞊐,1E290,India,1,Toto,txo,Exclusion,no,no,no,no,no 162,Vithkuqi,Vith,14.0,0,𐖂,10582,Albania,1,Albanian,sq,Exclusion,no,no,no,no,Yes diff --git 
a/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/country_language_population.tsv b/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/country_language_population.tsv index cf82e12dde7..64c5e34ffd8 100644 --- a/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/country_language_population.tsv +++ b/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/country_language_population.tsv @@ -1354,7 +1354,7 @@ Turkey TR "81,257,239" 94% "2,186,000,000,000" Balkan Gagauz Turkish bgx "370,0 Turkey TR "81,257,239" 94% "2,186,000,000,000" Bulgarian bg "341,000" Turkey TR "81,257,239" 94% "2,186,000,000,000" English en 17% Turkey TR "81,257,239" 94% "2,186,000,000,000" Georgian ka "45,300" -Turkey TR "81,257,239" 94% "2,186,000,000,000" Kara-Kalpak kaa 1% https://joshuaproject.net/languages/kaa +Turkey TR "81,257,239" 94% "2,186,000,000,000" Kara-Kalpak kaa 0.1% https://joshuaproject.net/languages/kaa Turkey TR "81,257,239" 94% "2,186,000,000,000" Greek el "4,000" Turkey TR "81,257,239" 94% "2,186,000,000,000" Kabardian kbd "623,000" Turkey TR "81,257,239" 94% "2,186,000,000,000" Kazakh kk 600 "http://en.wikipedia.org/wiki/Kazakh_language - the script is an assumption, needs a reference" diff --git a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/LikelySubtagsTest.java b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/LikelySubtagsTest.java index 6bfa2a607ff..f7aa69dbb41 100644 --- a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/LikelySubtagsTest.java +++ b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/LikelySubtagsTest.java @@ -39,11 +39,11 @@ import org.unicode.cldr.util.Factory; import org.unicode.cldr.util.LanguageTagParser; import org.unicode.cldr.util.Level; +import org.unicode.cldr.util.LocaleValidator; import org.unicode.cldr.util.ScriptToExemplars; import org.unicode.cldr.util.StandardCodes; import org.unicode.cldr.util.StandardCodes.LstrType; import 
org.unicode.cldr.util.SupplementalDataInfo; -import org.unicode.cldr.util.SupplementalDataInfo.PopulationData; import org.unicode.cldr.util.Validity; import org.unicode.cldr.util.Validity.Status; @@ -168,6 +168,7 @@ void add(LanguageTagParser ltp, boolean source) { final LanguageTagParser maxLtp = new LanguageTagParser(); final LanguageTagParser sourceLtp = new LanguageTagParser(); + final Set KNOWN_ERRORS = Set.of("en_Latn_MU", "en_Latn_SL", "en_Latn_TK", "en_Latn_ZM"); /** * Return false if we should skip the language * @@ -206,15 +207,22 @@ public boolean checkAdding(String source) { sourceLtp.setRegion(maxLtp.getRegion()); } String test = sourceLtp.toString(); - final String maximize = LIKELY.maximize(test); + String maximize = LIKELY.maximize(test); if (!max.equals(maximize)) { - // max(source) = max, max(test) ≠ max - if (!assertEquals( - String.format( - "checkAdding: max(%s)->%s, however max(%s)->", source, max, test), - max, - maximize)) { - // LIKELY.maximize(test); // Could step into this for debugging. + if (KNOWN_ERRORS.contains(maximize)) { + logKnownIssue("CLDR-17897", "Fix GenerateLikelySubtags.java"); + continue; + } + if (!max.equals(maximize)) { + // max(source) = max, max(test) ≠ max + if (!assertEquals( + String.format( + "checkAdding: max(%s)->%s, however max(%s)->", + source, max, test), + max, + maximize)) { + // LIKELY.maximize(test); // Could step into this for debugging. 
+ } } } sourceLtp.set(source); // restore @@ -703,85 +711,60 @@ public void testUndAllScriptsAndRegions() { } } - LanguageTagParser ltp = new LanguageTagParser(); - Set possibleFixes = new TreeSet<>(); - for (String region : regions) { - final String undRegion = "und_" + region; - if (region.equals("150") && likely.containsKey("und")) { - // skip - } else if (!assertTrue("contains und_" + region, likely.containsKey(undRegion))) { - Set languages = - SUPPLEMENTAL_DATA_INFO.getLanguagesForTerritoryWithPopulationData(region); - double biggest = -1; - String biggestLang = null; - for (String language : languages) { - PopulationData popData = - SUPPLEMENTAL_DATA_INFO.getLanguageAndTerritoryPopulationData( - language, region); - if (popData.getLiteratePopulation() > biggest) { - biggest = popData.getLiteratePopulation(); - biggestLang = language; - } - } - if (biggestLang != null) { - ltp.set(biggestLang); - if (ltp.getScript().isEmpty()) { - String biggestMax = likely.get(biggestLang); - ltp.set(biggestMax); - } - ltp.setRegion(region); - possibleFixes.add( - ""); - } - } - } - System.out.println("\t\t" + Joiner.on("\n\t\t").join(possibleFixes)); + + // Note: this used to test for all combinations of und_ + territory code. + // But we are now dropping redundant items, so any case where und_XX expands to en_Latn_XX, + // the und_XX is dropped. + // The code is just commented out in case we change in the future. 
+ + // LanguageTagParser ltp = new LanguageTagParser(); + // Set possibleFixes = new TreeSet<>(); + // for (String region : regions) { + // final String undRegion = "und_" + region; + // if (region.equals("150") && likely.containsKey("und")) { + // // skip + // } else if (!assertTrue("contains und_" + region, + // likely.containsKey(undRegion))) { + // Set languages = + // + // SUPPLEMENTAL_DATA_INFO.getLanguagesForTerritoryWithPopulationData(region); + // double biggest = -1; + // String biggestLang = null; + // for (String language : languages) { + // PopulationData popData = + // SUPPLEMENTAL_DATA_INFO.getLanguageAndTerritoryPopulationData( + // language, region); + // if (popData.getLiteratePopulation() > biggest) { + // biggest = popData.getLiteratePopulation(); + // biggestLang = language; + // } + // } + // if (biggestLang != null) { + // ltp.set(biggestLang); + // if (ltp.getScript().isEmpty()) { + // String biggestMax = likely.get(biggestLang); + // ltp.set(biggestMax); + // } + // ltp.setRegion(region); + // possibleFixes.add( + // ""); + // } + // } + // } + // System.out.println("\t\t" + Joiner.on("\n\t\t").join(possibleFixes)); } + private static final Joiner JOIN_LS = Joiner.on(CldrUtility.LINE_SEPARATOR); + public void testToAttributeValidityStatus() { - Set okLanguages = VALIDITY.getStatusToCodes(LstrType.language).get(Status.regular); - Set okScripts = VALIDITY.getStatusToCodes(LstrType.script).get(Status.regular); - Set okRegions = VALIDITY.getStatusToCodes(LstrType.region).get(Status.regular); Multimap badFieldsToLocales = TreeMultimap.create(); - Set knownExceptions = Set.of("in", "iw", "ji", "jw", "mo", "tl"); for (String s : likely.values()) { - CLDRLocale cLocale = CLDRLocale.getInstance(s); - final String language = cLocale.getLanguage(); - final String script = cLocale.getScript(); - final String region = cLocale.getCountry(); - if (!okLanguages.contains(language)) { - if (knownExceptions.contains(language)) { - continue; - } - 
badFieldsToLocales.put(language, s); - } - if (!okScripts.contains(script)) { - badFieldsToLocales.put(script, s); - } - if (!okRegions.contains(region)) { - badFieldsToLocales.put(region, s); - } - } - if (!badFieldsToLocales.isEmpty()) { - Multimap statusToExamples = TreeMultimap.create(); - for (String field : badFieldsToLocales.keySet()) { - Status status = VALIDITY.getCodeToStatus(LstrType.language).get(field); - if (status == null) { - status = VALIDITY.getCodeToStatus(LstrType.script).get(field); - } - if (status == null) { - status = VALIDITY.getCodeToStatus(LstrType.region).get(field); - } - statusToExamples.put(status, field); - } - Map fieldToOrigin = new TreeMap<>(); - for (Entry> entry : statusToExamples.asMap().entrySet()) { - // for (String value : entry.getValue()) { - // String origin = - // SUPPLEMENTAL_DATA_INFO.getLikelyOrigins().get(value); - // fieldToOrigin.put(value, origin == null ? "n/a" : origin); - // } - warnln("Bad status=" + entry.getKey() + " for " + entry.getValue()); + LanguageTagParser ltp = new LanguageTagParser().set(s); + Set errors = new LinkedHashSet<>(); + if (!LocaleValidator.isValid(ltp, LocaleValidator.ALLOW_IN_LIKELY, errors)) { + errln(Joiner.on('\t').join("Allowed subtag failure:", ltp, errors)); + continue; } } } diff --git a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestLocale.java b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestLocale.java index bd269f492ee..353a724fae5 100644 --- a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestLocale.java +++ b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestLocale.java @@ -900,14 +900,6 @@ public void testLanguageTagParserIsValid() { // likely subtags - LocaleValidator.AllowedValid allow001 = - new LocaleValidator.AllowedValid( - null, - LstrType.region, - new LocaleValidator.AllowedMatch("001|419"), - LstrType.language, - new LocaleValidator.AllowedMatch("und|in|iw|ji|jw|mo|tl")); - Map exceptions = Map.of( // "und_QO", 
"Disallowed region=QO, status=macroregion" @@ -918,13 +910,13 @@ public void testLanguageTagParserIsValid() { final String value = entry.getValue(); String expected = CldrUtility.ifNull(exceptions.get(key), ""); - LocaleValidator.isValid(ltp.set(key), allow001, errors); + LocaleValidator.isValid(ltp.set(key), LocaleValidator.ALLOW_IN_LIKELY, errors); assertEquals(key, expected, Joiner.on("; ").join(errors)); if (!expected.isEmpty()) { warnln("Likely subtags, skipping " + ltp + ", " + expected); } - LocaleValidator.isValid(ltp.set(value), allow001, errors); + LocaleValidator.isValid(ltp.set(value), LocaleValidator.ALLOW_IN_LIKELY, errors); assertEquals(value, "", Joiner.on("; ").join(errors)); }