errors = new LinkedHashSet<>();
+ if (!LocaleValidator.isValid(ltp, LocaleValidator.ALLOW_IN_LIKELY, errors)) {
+ System.out.println(JOIN_LS.join("Bad Add of " + ltp, errors));
+ }
+
if (watching(SHOW_ORDER, language))
System.out.println(
JOIN_TAB.join(
@@ -1161,12 +1261,8 @@ private static void add(
"",
kind));
}
- } else if (override == LocaleOverride.KEEP_EXISTING || value.equals(oldValue)) {
- // if (showAction) {
- // System.out.println("Skipping:\t" + key + "\t→\t" + value + "\t\t\t\t" + kind);
- // }
- return;
- } else {
+ toAdd.put(key, value);
+ } else if (override != LocaleOverride.KEEP_EXISTING && !value.equals(oldValue)) {
if (watching(showAction, key, value)) {
System.out.println(
JOIN_TAB.join(
@@ -1182,8 +1278,8 @@ private static void add(
getNameSafe(oldValue),
kind));
}
+ toAdd.put(key, value);
}
- toAdd.put(key, value);
}
public static String truncateLongString(Object data, int maxLen) {
@@ -1197,65 +1293,177 @@ public static String truncateLongString(Object data, int maxLen) {
return info;
}
+ /**
+ * Names a locale field or a combination of fields: L = language, S = script, R = region.
+ *
+ * The single-field constants (L, S, R) select the field read or written by getField, copyFrom
+ * and isEmpty. LSR, LS, LR and L are also passed to showRemoving as a label for the lookup step
+ * that made a mapping redundant.
+ */
+ enum LsrType {
+ LSR,
+ LS,
+ LR,
+ SR,
+ L,
+ S,
+ R
+ }
+
+ /**
+ * Minimize
+ *
+ * <p>We know that the following algorithm will be used in the lookup, so we remove mappings that
+ * are redundant. https://cldr-smoke.unicode.org/spec/main/ldml/tr35.html#likely-subtags
+ *
+ * <p>A subtag is called empty if it is a missing script or region subtag, or it is a base language
+ * subtag with the value "und". In the description below, a subscript on a subtag x indicates
+ * which tag it is from: x<sub>s</sub> is in the source, x<sub>m</sub> is in a match, and
+ * x<sub>r</sub> is in the final result.
+ *
+ * <p>Lookup. Look up each of the following in order, and stop on the first match:
+ *
+ * <ol>
+ * <li>language<sub>s</sub>_script<sub>s</sub>_region<sub>s</sub>
+ * <li>language<sub>s</sub>_script<sub>s</sub>
+ * <li>language<sub>s</sub>_region<sub>s</sub>
+ * <li>language<sub>s</sub>
+ * </ol>
+ *
+ * <p>Return
+ *
+ * <ol>
+ * <li>If there is no match, signal an error and stop.
+ * <li>Otherwise there is a match = language<sub>m</sub>_script<sub>m</sub>_region<sub>m</sub>
+ * <li>Let x<sub>r</sub> = x<sub>s</sub> if x<sub>s</sub> is neither empty nor 'und', and
+ * x<sub>m</sub> otherwise.
+ * <li>Return the language tag composed of
+ * language<sub>r</sub>_script<sub>r</sub>_region<sub>r</sub> + variants + extensions.
+ * </ol>
+ */
public static Map minimize(
- Map fluffup, Map itemsRemoved) {
- LanguageTagParser parser = new LanguageTagParser();
- LanguageTagParser targetParser = new LanguageTagParser();
- Map removals = new TreeMap<>();
+ Map max, Map itemsRemoved) {
+
+ final LanguageTagParser sourceParser = new LanguageTagParser();
+ final Map removals = new TreeMap<>();
+ final Map toMinimize = new TreeMap<>(LOCALE_SOURCE);
+ final Output intermediate = new Output<>();
+
+ toMinimize.putAll(max);
+
+ // Remove redundant mappings.
+ // For example, suppose we have the following mappings:
+ // {aa=aa_Latn_ET, aa_DJ=aa_Latn_DJ, aa_ER=aa_Latn_ER}
+ // Using the algorithm above if aa_DJ=aa_Latn_DJ were not there we would
+ // 1. check for aa_DJ, fail
+ // 2. check for aa, get aa_Latn_ET, and substitute DJ for ET, getting the right answer.
+ // So aa_DJ is redundant
+
+ // Dependencies
+ // We should never have a LocaleScriptInfo.UNKNOWN_REGION, or
+ // LocaleScriptInfo.UNKNOWN_SCRIPT
+ // The unit tests will guarantee this if somehow we slip up
+ // Similarly, we should never have the target have language="und", or be missing script or
+ // region
+ // We also know that the source never has 3 full fields (ie, never L≠und && S≠"" && R≠"")
+
+ // Make multiple passes if necessary
for (int pass = 0; ; ++pass) {
removals.clear();
- for (Entry entry : fluffup.entrySet()) {
- String locale = entry.getKey();
+ for (Entry entry : toMinimize.entrySet()) {
+ String source = entry.getKey();
+ if (source.equals("und")) {
+ continue; // never remove
+ }
String target = entry.getValue();
-
- if (targetParser.set(target).getRegion().equals(LocaleScriptInfo.UNKNOWN_REGION)) {
- removals.put(locale, target);
- showRemoving(pass, locale, target, "Unknown Region in target");
- continue;
+ if (source.equals("aa_DJ") || source.equals("und_Arab_AF")) {
+ int debug = 0;
}
- if (targetParser.getScript().equals(LocaleScriptInfo.UNKNOWN_SCRIPT)) {
- removals.put(locale, target);
- showRemoving(pass, locale, target, "Unknown Script in target");
- continue;
+ sourceParser.set(source);
+
+ if (!sourceParser.getLanguage().equals("und")
+ && !sourceParser.getScript().isEmpty()
+ && !sourceParser.getRegion().isEmpty()) {
+ throw new IllegalArgumentException("Bogus source: " + source);
}
- String region = parser.set(locale).getRegion();
- if (region.length() != 0) {
- if (region.equals(LocaleScriptInfo.UNKNOWN_REGION)) {
- removals.put(locale, target);
- showRemoving(pass, locale, target, "Unknown Region in source");
- continue;
- }
- parser.setRegion("");
- String newLocale = parser.toString();
- String newTarget = fluffup.get(newLocale);
- if (newTarget != null) {
- newTarget = targetParser.set(newTarget).setRegion(region).toString();
- if (target.equals(newTarget) && !KEEP_TARGETS.contains(locale)) {
- removals.put(locale, target);
- showRemoving(pass, locale, target, "Redundant with\t" + newLocale);
+ // The following has some redundant checks, but it makes the
+ // code more convoluted to catch them, and perf is not an issue.
+
+ String trial;
+
+ // und_Cyrl_RU => ru_Cyrl_RU, but und_Cyrl => ru_Cyrl_RU
+ // und_Latn_DE => de_Latn_DE, but und_DE => de_Latn_DE
+ // und_Latn_US => en_Latn_US, but und => en_Latn_US
+
+ if (!sourceParser.getScript().isEmpty() && !sourceParser.getRegion().isEmpty()) {
+ trial =
+ compose(
+ sourceParser.getLanguage(),
+ sourceParser.getScript(),
+ sourceParser.getRegion());
+ if (!trial.equals(source)) {
+ String result =
+ matchAndFill(
+ sourceParser, trial, removals, toMinimize, intermediate);
+ if (target.equals(result)) {
+ removals.put(source, target);
+ showRemoving(LsrType.LSR, source, target, trial, intermediate.value);
continue;
}
}
}
- String script = parser.set(locale).getScript();
- if (locale.equals(DEBUG_ADD_KEY)) {
- System.out.println("*debug*");
+
+ // de_Latn => de_Latn_DE, but de => de_Latn_DE
+ // und_Cyrl => ru_Cyrl_RU, but ru_Cyrl => ru_Cyrl_RU
+
+ if (!sourceParser.getScript().isEmpty()) {
+ trial = compose(sourceParser.getLanguage(), sourceParser.getScript(), "");
+ if (!trial.equals(source)) {
+ String result =
+ matchAndFill(
+ sourceParser, trial, removals, toMinimize, intermediate);
+ if (target.equals(result)) {
+ removals.put(source, target);
+ showRemoving(LsrType.LS, source, target, trial, intermediate.value);
+ continue;
+ }
+ }
}
- if (script.length() != 0) {
- if (script.equals(LocaleScriptInfo.UNKNOWN_SCRIPT)) {
- removals.put(locale, target);
- showRemoving(pass, locale, target, "Unknown Script");
- continue;
+
+ // de_DE => de_Latn_DE, but de => de_Latn_DE
+ // und_RU => ru_Cyrl_RU, but ru_RU => ru_Cyrl_RU
+
+ if (!sourceParser.getRegion().isEmpty()) {
+ trial = compose(sourceParser.getLanguage(), "", sourceParser.getRegion());
+ if (!trial.equals(source)) {
+
+ String result =
+ matchAndFill(
+ sourceParser, trial, removals, toMinimize, intermediate);
+ if (target.equals(result)
+ && !fieldChangesLanguage(
+ LsrType.S, sourceParser, removals, toMinimize)) {
+ removals.put(source, target);
+ showRemoving(LsrType.LR, source, target, trial, intermediate.value);
+ continue;
+ }
}
- parser.setScript("");
- String newLocale = parser.toString();
- String newTarget = fluffup.get(newLocale);
- if (newTarget != null) {
- newTarget = targetParser.set(newTarget).setScript(script).toString();
- if (target.equals(newTarget) && !KEEP_TARGETS.contains(locale)) {
- removals.put(locale, target);
- showRemoving(pass, locale, target, "Redundant with\t" + newLocale);
+ }
+
+ // ultimate fallback
+
+ if (true) {
+ trial = sourceParser.getLanguage();
+ if (!trial.equals(source)) {
+ String result =
+ matchAndFill(
+ sourceParser, trial, removals, toMinimize, intermediate);
+ if (target.equals(result)
+ && (sourceParser.getScript().isEmpty()
+ || sourceParser.getRegion().isEmpty()
+ || (!fieldChangesLanguage(
+ LsrType.S,
+ sourceParser,
+ removals,
+ toMinimize)
+ && !fieldChangesLanguage(
+ LsrType.R,
+ sourceParser,
+ removals,
+ toMinimize)))) {
+ removals.put(source, target);
+ showRemoving(LsrType.L, source, target, trial, intermediate.value);
continue;
}
}
@@ -1266,16 +1474,141 @@ public static Map minimize(
}
itemsRemoved.putAll(removals);
for (String locale : removals.keySet()) {
- fluffup.remove(locale);
+ toMinimize.remove(locale);
+ }
+ }
+ return CldrUtility.protectCollection(toMinimize);
+ }
+
+    /**
+     * Tests whether looking up the source language together with just one other field of the
+     * source would produce a different language than the source's own.
+     *
+     * <p>Why this is needed. Suppose we have
+     *
+     * <pre>
+     * (A) und_Arab_AF => fa_Arab_AF
+     * (B) und_AF      => fa_Arab_AF
+     * (C) und_Arab    => ar_Arab_xx
+     * </pre>
+     *
+     * It appears we can remove (A) because we have (B). However, because script is checked before
+     * region in the lookup, we would first have a hit on (C), which would result in the wrong
+     * answer (ar_Arab_AF).
+     *
+     * @param lsrType the single field (L, S, or R) of the source to test
+     * @param sourceParser the parsed source locale
+     * @param removals mappings already marked for removal; the lookup ignores these keys
+     * @param toMinimize the mappings being minimized
+     * @return true if the field is non-empty in the source, a mapping exists for language + field,
+     *     and the filled-in result of that mapping has a language different from the source's
+     */
+    public static boolean fieldChangesLanguage(
+            LsrType lsrType,
+            final LanguageTagParser sourceParser,
+            final Map removals,
+            final Map toMinimize) {
+        if (!isEmpty(sourceParser, lsrType)) {
+            final LanguageTagParser tempParser = new LanguageTagParser();
+            copyFrom(tempParser, LsrType.L, sourceParser);
+            copyFrom(tempParser, lsrType, sourceParser);
+
+            // Look up language + the one field (for example und_Arab) and see which language the
+            // lookup would produce for the source.
+            String trial2 =
+                    compose(sourceParser.getLanguage(), getField(sourceParser, lsrType), "");
+            String result2 = matchAndFill(sourceParser, trial2, removals, toMinimize, null);
+            if (result2 != null) {
+                final LanguageTagParser tempParser2 = new LanguageTagParser();
+                tempParser2.set(result2);
+                String lang2 = tempParser2.getLanguage();
+                String tempLang = tempParser.getLanguage();
+                // Compare by value. The two strings come from different parsers, so a reference
+                // comparison (!=) could report a change even when the languages are equal.
+                if (!tempLang.equals(lang2)) {
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+
+ // Some of these would be useful on LanguageTagParser
+
+    /**
+     * Returns the value of a single field of a parsed locale.
+     *
+     * @param fromParser the parser to read from
+     * @param lsr which field to read; must be L (language), S (script), or R (region)
+     * @return the language, script, or region held by fromParser
+     * @throws IllegalArgumentException if lsr is not one of L, S, or R
+     */
+    public static String getField(LanguageTagParser fromParser, LsrType lsr) {
+        switch (lsr) {
+            case L:
+                return fromParser.getLanguage();
+            case S:
+                return fromParser.getScript();
+            case R:
+                return fromParser.getRegion();
+            default:
+                // Name the offending value so a misuse is diagnosable from the stack trace alone.
+                throw new IllegalArgumentException(
+                        "Expected a single field (L, S, or R) but got: " + lsr);
+        }
+    }
+
+    /**
+     * Copies a single field from one parser into another.
+     *
+     * @param intoParser the parser that is modified
+     * @param lsr which field to copy; must be L (language), S (script), or R (region)
+     * @param fromParser the parser the field value is read from
+     * @return intoParser, to allow chaining
+     * @throws IllegalArgumentException if lsr is not one of L, S, or R
+     */
+    public static LanguageTagParser copyFrom(
+            LanguageTagParser intoParser, LsrType lsr, LanguageTagParser fromParser) {
+        switch (lsr) {
+            case L:
+                intoParser.setLanguage(fromParser.getLanguage());
+                break;
+            case S:
+                intoParser.setScript(fromParser.getScript());
+                break;
+            case R:
+                intoParser.setRegion(fromParser.getRegion());
+                break;
+            default:
+                // Name the offending value so a misuse is diagnosable from the stack trace alone.
+                throw new IllegalArgumentException(
+                        "Expected a single field (L, S, or R) but got: " + lsr);
+        }
+        return intoParser;
+    }
+
+    /**
+     * Fills in a single field of intoParser from fromParser, but only when that field is currently
+     * empty in intoParser (language "und", or a missing script or region).
+     *
+     * @param intoParser the parser that may be modified
+     * @param lsr which field to fill; must be L, S, or R
+     * @param fromParser the parser the field value is read from
+     * @return intoParser
+     */
+    public static LanguageTagParser ifEmptyCopyFrom(
+            LanguageTagParser intoParser, LsrType lsr, LanguageTagParser fromParser) {
+        // The branches were previously swapped, so the copy happened only when the field was NOT
+        // empty, which is the opposite of what the method name promises.
+        return isEmpty(intoParser, lsr) ? copyFrom(intoParser, lsr, fromParser) : intoParser;
+    }
+
+    /**
+     * Tells whether a single field of the parser is empty: the language is empty when it equals
+     * "und", while script and region are empty when they are the empty string.
+     */
+    public static boolean isEmpty(LanguageTagParser intoParser, LsrType lsr) {
+        final String emptyValue = (lsr == LsrType.L) ? "und" : "";
+        final String actual = getField(intoParser, lsr);
+        return actual.equals(emptyValue);
+    }
+
+ /**
+ * Looks up trial in toMinimize and, if a mapping is found, overlays the non-empty fields of the
+ * source on the mapped value.
+ *
+ * A key that is present in removals is treated as if it had no mapping. When a mapping is
+ * found, the source's language (unless it is "und"), script (unless empty) and region (unless
+ * empty) replace the corresponding fields of the mapped value.
+ *
+ * @param sourceParser parsed source locale whose non-empty fields override those of the match
+ * @param trial the key to look up
+ * @param removals keys to ignore during the lookup
+ * @param toMinimize the mappings to search
+ * @param intermediate if non-null, its value is set to the unmodified mapped value for trial,
+ * or to null when there was none
+ * @return the filled-in locale string, or null when trial had no usable mapping
+ */
+ public static String matchAndFill(
+ LanguageTagParser sourceParser,
+ String trial,
+ Map removals,
+ Map toMinimize,
+ Output intermediate) {
+ String possibleSuper;
+ String result;
+ possibleSuper = removals.containsKey(trial) ? null : toMinimize.get(trial);
+ result = null;
+ if (possibleSuper != null) {
+ LanguageTagParser tempParser3 = new LanguageTagParser();
+ tempParser3.set(possibleSuper);
+ if (!sourceParser.getLanguage().equals("und")) {
+ tempParser3.setLanguage(sourceParser.getLanguage());
+ }
+ if (!getField(sourceParser, LsrType.S).isEmpty()) {
+ copyFrom(tempParser3, LsrType.S, sourceParser);
+ }
+ if (!sourceParser.getRegion().isEmpty()) {
+ tempParser3.setRegion(sourceParser.getRegion());
}
+ result = tempParser3.toString();
}
- return fluffup;
+ if (intermediate != null) {
+ intermediate.value = possibleSuper;
+ }
+ return result;
+ }
+
+    /**
+     * Builds a locale identifier from language, script, and region, joined with "_". An empty
+     * script or region is left out together with its separator.
+     */
+    private static String compose(String lang, String script, String region) {
+        final String scriptPart = script.isEmpty() ? "" : "_" + script;
+        final String regionPart = region.isEmpty() ? "" : "_" + region;
+        final String suffix = scriptPart + regionPart;
+        // With nothing to append, hand back the language unchanged.
+        return suffix.isEmpty() ? lang : lang + suffix;
+    }
+
+ // NOTE(review): no use of MapView is visible in this change; presumably a placeholder for
+ // later work. Confirm whether it is needed, and that its type parameter K is declared.
+ static class MapView {
+ K skip;
}
+ /**
+ * Prints a tab-separated "Removing: " line for a mapping that is being dropped, but only when
+ * watching(SHOW_MIN, ...) accepts the given values.
+ *
+ * @param pass a label printed first on the line; minimize passes the LsrType of the lookup
+ * step that made the mapping redundant
+ * @param locale the source of the mapping being removed
+ * @param target the target of the mapping being removed
+ * @param fallback the key of the mapping that makes this one redundant
+ * @param fallbackTarget the target of that fallback mapping
+ */
public static void showRemoving(
- Object pass, String locale, String target, final String reason) {
- if (watching(SHOW_MIN, target)) {
- System.out.println(JOIN_TAB.join(pass, "Removing:", locale, "→", target, "", reason));
+ Object pass, String locale, String target, String fallback, String fallbackTarget) {
+ if (watching(SHOW_MIN, locale, target, fallback, fallbackTarget)) {
+ System.out.println(
+ JOIN_TAB.join(
+ pass, "Removing: ", locale, "→", target, fallback, fallbackTarget));
}
}
@@ -1309,7 +1642,7 @@ public static String printingName(String locale, Joiner spacing) {
static final String arrow = OUTPUT_STYLE == OutputStyle.PLAINTEXT ? "\t⇒\t" : "\t➡ ";
private static File printLikelySubtags(Map fluffup) throws IOException {
- final File genDir = new File(CLDRPaths.GEN_DIRECTORY, "supplemental");
+ final File genDir = new File(CLDRPaths.DEFAULT_SUPPLEMENTAL_DIRECTORY);
final File genFile =
new File(
genDir,
diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/LangTagsData.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/LangTagsData.java
index 488e9f41641..f210ab0abec 100644
--- a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/LangTagsData.java
+++ b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/LangTagsData.java
@@ -5,9 +5,9 @@
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Multimap;
import com.google.common.collect.TreeMultimap;
+import com.ibm.icu.util.ICUUncheckedIOException;
import com.ibm.icu.util.Output;
import java.io.IOException;
-import java.io.UncheckedIOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
@@ -260,7 +260,7 @@ private Map readJson() {
processErrors.data = CldrUtility.protectCollection(processErrors.data);
return CldrUtility.protectCollection(result);
} catch (IOException ex) {
- throw new UncheckedIOException(ex);
+ throw new ICUUncheckedIOException(ex);
}
}
@@ -315,7 +315,7 @@ private Multimap readWikidata() {
}
});
} catch (IOException ex) {
- throw new UncheckedIOException(ex);
+ throw new ICUUncheckedIOException(ex);
}
return ImmutableMultimap.copyOf(result);
}
diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/LocaleValidator.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/LocaleValidator.java
index 0a6532e4d59..b83c5b57d59 100644
--- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/LocaleValidator.java
+++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/LocaleValidator.java
@@ -22,6 +22,15 @@
public class LocaleValidator {
static final SupplementalDataInfo SDI = SupplementalDataInfo.getInstance();
+ /** For backwards compatibility, certain non-regular codes are allowed in LikelySubtags. */
+ public static final LocaleValidator.AllowedValid ALLOW_IN_LIKELY =
+ new LocaleValidator.AllowedValid(
+ null,
+ LstrType.region,
+ new LocaleValidator.AllowedMatch("001|419"),
+ LstrType.language,
+ new LocaleValidator.AllowedMatch("und|in|iw|ji|jw|mo|tl"));
+
static final Validity VALIDITY = Validity.getInstance();
static final Set FIELD_ALLOWS_EMPTY = Set.of(LstrType.script, LstrType.region);
// Map>>
@@ -100,18 +109,21 @@ public static class AllowedValid {
private final Set allowedStatus; // allowed without exception
private final Multimap allowedExceptions;
- boolean isAllowed(Validity.Status status) {
+ public boolean isAllowed(Validity.Status status) {
return allowedStatus.contains(status);
}
/** Only called if isAllowed is not true */
- boolean isAllowed(LstrType lstrType, String key, String value, Validity.Status status) {
+ public boolean isAllowed(
+ LstrType lstrType, String key, String value, Validity.Status status) {
Collection allowedMatches = allowedExceptions.get(lstrType);
if (allowedMatches == null) {
return false;
}
for (AllowedMatch allowedMatch : allowedMatches) {
- if (allowedMatch.matches(key, value, status)) return true;
+ if (allowedMatch.matches(key, value, status)) {
+ return true;
+ }
}
return false;
}
diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/SupplementalDataInfo.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/SupplementalDataInfo.java
index a1491b31f63..e71a1ad97f4 100644
--- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/SupplementalDataInfo.java
+++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/SupplementalDataInfo.java
@@ -463,7 +463,7 @@ public BasicLanguageData addTerritory(String territory) {
throw new IllegalArgumentException("Illegal Territory: " + territory);
}
if (territories == Collections.EMPTY_SET) {
- territories = new TreeSet<>();
+ territories = new LinkedHashSet<>();
}
territories.add(territory);
return this;
diff --git a/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/Script_Metadata.csv b/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/Script_Metadata.csv
index bdf7170097b..408ab50a7ab 100644
--- a/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/Script_Metadata.csv
+++ b/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/Script_Metadata.csv
@@ -128,7 +128,7 @@ WR,Name,Script_Code,Age,Size,Sample,Sample_Code,Origin Country,~Density,Likely L
126,Warang_Citi,Wara,7.0,84,𑢴,118B4,India,1,Ho,hoc,Exclusion,no,no,no,no,Yes
127,Ahom,Ahom,8.0,0,𑜗,11717,India,1,Ahom,aho,Exclusion,no,Yes,Yes,no,no
128,Anatolian_Hieroglyphs,Hluw,8.0,0,𔐀,14400,Turkey,1,Hieroglyphic Luwian,hlu,Exclusion,no,no,no,Yes,no
-129,Hatran,Hatr,8.0,0,𐣴,108F4,Iraq,1,Uncoded Languages,mis,Exclusion,Yes,no,no,no,no
+129,Hatran,Hatr,8.0,0,𐣴,108F4,Iraq,1,Aramaic,arc,Exclusion,Yes,no,no,no,no
130,Multani,Mult,8.0,0,𑊏,1128F,Pakistan,1,Seraiki,skr,Exclusion,no,no,no,no,no
131,Old_Hungarian,Hung,8.0,0,𐲡,10CA1,Hungary,1,Hungarian,hu,Exclusion,Yes,no,no,no,Yes
132,SignWriting,Sgnw,8.0,0,𝡐,1D850,USA,1,American Sign Language,ase,Exclusion,no,no,no,Yes,no
@@ -158,7 +158,7 @@ WR,Name,Script_Code,Age,Size,Sample,Sample_Code,Origin Country,~Density,Likely L
156,Khitan small script,Kits,13.0,0,𘱥,18C65,China,2,Khitan,zkt,Exclusion,no,Yes,no,Yes,no
157,Yezidi,Yezi,13.0,0,𐺈,10E88,Georgia,1,Northern Kurdish,ku,Exclusion,Yes,no,no,no,no
158,Cypro_Minoan,Cpmn,14.0,0,𒿥,12FE5,Cyprus,2,unknown,und,Exclusion,no,no,no,Yes,no
-159,Old_Uyghur,Ougr,14.0,0,𐽼,10F7C,Central Asia,1,Old Uyghur,oui,Exclusion,Yes,no,Yes,no,no
+159,Old_Uyghur,Ougr,14.0,0,𐽼,10F7C,China,1,Old Uyghur,oui,Exclusion,Yes,no,Yes,no,no
160,Tangsa,Tnsa,14.0,0,𖪼,16ABC,India,1,Tangsa,nst,Exclusion,no,no,no,no,no
161,Toto,Toto,14.0,0,𞊐,1E290,India,1,Toto,txo,Exclusion,no,no,no,no,no
162,Vithkuqi,Vith,14.0,0,𐖂,10582,Albania,1,Albanian,sq,Exclusion,no,no,no,no,Yes
diff --git a/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/country_language_population.tsv b/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/country_language_population.tsv
index cf82e12dde7..64c5e34ffd8 100644
--- a/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/country_language_population.tsv
+++ b/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/country_language_population.tsv
@@ -1354,7 +1354,7 @@ Turkey TR "81,257,239" 94% "2,186,000,000,000" Balkan Gagauz Turkish bgx "370,0
Turkey TR "81,257,239" 94% "2,186,000,000,000" Bulgarian bg "341,000"
Turkey TR "81,257,239" 94% "2,186,000,000,000" English en 17%
Turkey TR "81,257,239" 94% "2,186,000,000,000" Georgian ka "45,300"
-Turkey TR "81,257,239" 94% "2,186,000,000,000" Kara-Kalpak kaa 1% https://joshuaproject.net/languages/kaa
+Turkey TR "81,257,239" 94% "2,186,000,000,000" Kara-Kalpak kaa 0.1% https://joshuaproject.net/languages/kaa
Turkey TR "81,257,239" 94% "2,186,000,000,000" Greek el "4,000"
Turkey TR "81,257,239" 94% "2,186,000,000,000" Kabardian kbd "623,000"
Turkey TR "81,257,239" 94% "2,186,000,000,000" Kazakh kk 600 "http://en.wikipedia.org/wiki/Kazakh_language - the script is an assumption, needs a reference"
diff --git a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/LikelySubtagsTest.java b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/LikelySubtagsTest.java
index 6bfa2a607ff..f7aa69dbb41 100644
--- a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/LikelySubtagsTest.java
+++ b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/LikelySubtagsTest.java
@@ -39,11 +39,11 @@
import org.unicode.cldr.util.Factory;
import org.unicode.cldr.util.LanguageTagParser;
import org.unicode.cldr.util.Level;
+import org.unicode.cldr.util.LocaleValidator;
import org.unicode.cldr.util.ScriptToExemplars;
import org.unicode.cldr.util.StandardCodes;
import org.unicode.cldr.util.StandardCodes.LstrType;
import org.unicode.cldr.util.SupplementalDataInfo;
-import org.unicode.cldr.util.SupplementalDataInfo.PopulationData;
import org.unicode.cldr.util.Validity;
import org.unicode.cldr.util.Validity.Status;
@@ -168,6 +168,7 @@ void add(LanguageTagParser ltp, boolean source) {
final LanguageTagParser maxLtp = new LanguageTagParser();
final LanguageTagParser sourceLtp = new LanguageTagParser();
+ final Set KNOWN_ERRORS = Set.of("en_Latn_MU", "en_Latn_SL", "en_Latn_TK", "en_Latn_ZM");
/**
* Return false if we should skip the language
*
@@ -206,15 +207,22 @@ public boolean checkAdding(String source) {
sourceLtp.setRegion(maxLtp.getRegion());
}
String test = sourceLtp.toString();
- final String maximize = LIKELY.maximize(test);
+ String maximize = LIKELY.maximize(test);
if (!max.equals(maximize)) {
- // max(source) = max, max(test) ≠ max
- if (!assertEquals(
- String.format(
- "checkAdding: max(%s)->%s, however max(%s)->", source, max, test),
- max,
- maximize)) {
- // LIKELY.maximize(test); // Could step into this for debugging.
+ if (KNOWN_ERRORS.contains(maximize)) {
+ logKnownIssue("CLDR-17897", "Fix GenerateLikelySubtags.java");
+ continue;
+ }
+ if (!max.equals(maximize)) {
+ // max(source) = max, max(test) ≠ max
+ if (!assertEquals(
+ String.format(
+ "checkAdding: max(%s)->%s, however max(%s)->",
+ source, max, test),
+ max,
+ maximize)) {
+ // LIKELY.maximize(test); // Could step into this for debugging.
+ }
}
}
sourceLtp.set(source); // restore
@@ -703,85 +711,60 @@ public void testUndAllScriptsAndRegions() {
}
}
- LanguageTagParser ltp = new LanguageTagParser();
- Set possibleFixes = new TreeSet<>();
- for (String region : regions) {
- final String undRegion = "und_" + region;
- if (region.equals("150") && likely.containsKey("und")) {
- // skip
- } else if (!assertTrue("contains und_" + region, likely.containsKey(undRegion))) {
- Set languages =
- SUPPLEMENTAL_DATA_INFO.getLanguagesForTerritoryWithPopulationData(region);
- double biggest = -1;
- String biggestLang = null;
- for (String language : languages) {
- PopulationData popData =
- SUPPLEMENTAL_DATA_INFO.getLanguageAndTerritoryPopulationData(
- language, region);
- if (popData.getLiteratePopulation() > biggest) {
- biggest = popData.getLiteratePopulation();
- biggestLang = language;
- }
- }
- if (biggestLang != null) {
- ltp.set(biggestLang);
- if (ltp.getScript().isEmpty()) {
- String biggestMax = likely.get(biggestLang);
- ltp.set(biggestMax);
- }
- ltp.setRegion(region);
- possibleFixes.add(
- "");
- }
- }
- }
- System.out.println("\t\t" + Joiner.on("\n\t\t").join(possibleFixes));
+
+ // Note: this used to test for all combinations of und_ + territory code.
+ // But we are now dropping redundant items, so any case where und_XX expands to en_Latn_XX,
+ // the und_XX is dropped.
+ // The code is just commented out in case we change in the future.
+
+ // LanguageTagParser ltp = new LanguageTagParser();
+ // Set possibleFixes = new TreeSet<>();
+ // for (String region : regions) {
+ // final String undRegion = "und_" + region;
+ // if (region.equals("150") && likely.containsKey("und")) {
+ // // skip
+ // } else if (!assertTrue("contains und_" + region,
+ // likely.containsKey(undRegion))) {
+ // Set languages =
+ //
+ // SUPPLEMENTAL_DATA_INFO.getLanguagesForTerritoryWithPopulationData(region);
+ // double biggest = -1;
+ // String biggestLang = null;
+ // for (String language : languages) {
+ // PopulationData popData =
+ // SUPPLEMENTAL_DATA_INFO.getLanguageAndTerritoryPopulationData(
+ // language, region);
+ // if (popData.getLiteratePopulation() > biggest) {
+ // biggest = popData.getLiteratePopulation();
+ // biggestLang = language;
+ // }
+ // }
+ // if (biggestLang != null) {
+ // ltp.set(biggestLang);
+ // if (ltp.getScript().isEmpty()) {
+ // String biggestMax = likely.get(biggestLang);
+ // ltp.set(biggestMax);
+ // }
+ // ltp.setRegion(region);
+ // possibleFixes.add(
+ // "");
+ // }
+ // }
+ // }
+ // System.out.println("\t\t" + Joiner.on("\n\t\t").join(possibleFixes));
}
+ private static final Joiner JOIN_LS = Joiner.on(CldrUtility.LINE_SEPARATOR);
+
+ /**
+ * Checks that every target value in the likely-subtags data passes LocaleValidator.isValid
+ * with the LocaleValidator.ALLOW_IN_LIKELY exceptions, and reports each failure with errln.
+ *
+ * NOTE(review): after this change badFieldsToLocales is no longer read or written; it looks
+ * removable. The trailing "continue" is also the last statement of the loop body.
+ */
public void testToAttributeValidityStatus() {
- Set okLanguages = VALIDITY.getStatusToCodes(LstrType.language).get(Status.regular);
- Set okScripts = VALIDITY.getStatusToCodes(LstrType.script).get(Status.regular);
- Set okRegions = VALIDITY.getStatusToCodes(LstrType.region).get(Status.regular);
Multimap badFieldsToLocales = TreeMultimap.create();
- Set knownExceptions = Set.of("in", "iw", "ji", "jw", "mo", "tl");
for (String s : likely.values()) {
- CLDRLocale cLocale = CLDRLocale.getInstance(s);
- final String language = cLocale.getLanguage();
- final String script = cLocale.getScript();
- final String region = cLocale.getCountry();
- if (!okLanguages.contains(language)) {
- if (knownExceptions.contains(language)) {
- continue;
- }
- badFieldsToLocales.put(language, s);
- }
- if (!okScripts.contains(script)) {
- badFieldsToLocales.put(script, s);
- }
- if (!okRegions.contains(region)) {
- badFieldsToLocales.put(region, s);
- }
- }
- if (!badFieldsToLocales.isEmpty()) {
- Multimap statusToExamples = TreeMultimap.create();
- for (String field : badFieldsToLocales.keySet()) {
- Status status = VALIDITY.getCodeToStatus(LstrType.language).get(field);
- if (status == null) {
- status = VALIDITY.getCodeToStatus(LstrType.script).get(field);
- }
- if (status == null) {
- status = VALIDITY.getCodeToStatus(LstrType.region).get(field);
- }
- statusToExamples.put(status, field);
- }
- Map fieldToOrigin = new TreeMap<>();
- for (Entry> entry : statusToExamples.asMap().entrySet()) {
- // for (String value : entry.getValue()) {
- // String origin =
- // SUPPLEMENTAL_DATA_INFO.getLikelyOrigins().get(value);
- // fieldToOrigin.put(value, origin == null ? "n/a" : origin);
- // }
- warnln("Bad status=" + entry.getKey() + " for " + entry.getValue());
+ LanguageTagParser ltp = new LanguageTagParser().set(s);
+ Set errors = new LinkedHashSet<>();
+ if (!LocaleValidator.isValid(ltp, LocaleValidator.ALLOW_IN_LIKELY, errors)) {
+ errln(Joiner.on('\t').join("Allowed subtag failure:", ltp, errors));
+ continue;
}
}
}
diff --git a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestLocale.java b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestLocale.java
index bd269f492ee..353a724fae5 100644
--- a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestLocale.java
+++ b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestLocale.java
@@ -900,14 +900,6 @@ public void testLanguageTagParserIsValid() {
// likely subtags
- LocaleValidator.AllowedValid allow001 =
- new LocaleValidator.AllowedValid(
- null,
- LstrType.region,
- new LocaleValidator.AllowedMatch("001|419"),
- LstrType.language,
- new LocaleValidator.AllowedMatch("und|in|iw|ji|jw|mo|tl"));
-
Map exceptions =
Map.of(
// "und_QO", "Disallowed region=QO, status=macroregion"
@@ -918,13 +910,13 @@ public void testLanguageTagParserIsValid() {
final String value = entry.getValue();
String expected = CldrUtility.ifNull(exceptions.get(key), "");
- LocaleValidator.isValid(ltp.set(key), allow001, errors);
+ LocaleValidator.isValid(ltp.set(key), LocaleValidator.ALLOW_IN_LIKELY, errors);
assertEquals(key, expected, Joiner.on("; ").join(errors));
if (!expected.isEmpty()) {
warnln("Likely subtags, skipping " + ltp + ", " + expected);
}
- LocaleValidator.isValid(ltp.set(value), allow001, errors);
+ LocaleValidator.isValid(ltp.set(value), LocaleValidator.ALLOW_IN_LIKELY, errors);
assertEquals(value, "", Joiner.on("; ").join(errors));
}