Skip to content

Commit

Permalink
CLDR-17884 Regex match
Browse files Browse the repository at this point in the history
  • Loading branch information
conradarcturus committed Aug 14, 2024
1 parent 02fa914 commit c9901a4
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 50 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,11 @@
import com.ibm.icu.util.ULocale;
import java.io.IOException;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Pattern;
import java.util.Set;
import java.util.TreeMap;
import org.unicode.cldr.util.CldrUtility;
Expand All @@ -21,6 +23,7 @@ public class CountryCodeConverter {
private static Map<String, String> nameToCountryCode =
new TreeMap<>(new UTF16.StringComparator(true, true, 0));
private static Set<String> parseErrors = new LinkedHashSet<>();
private static Map<Pattern, String> namePatternToCountryCode = new HashMap<>();

public static String getCodeFromName(String display, boolean showMissing) {
return getCodeFromName(display, showMissing, null);
Expand All @@ -46,18 +49,20 @@ public static String getCodeFromName(String display, boolean showMissing, Set<St
if (reverseCommaTrial != null) {
result = nameToCountryCode.get(reverseCommaTrial);
// if (result != null) {
// addName(trial, result);
// addNameToCountry(trial, result);
// }
}
}

// Try to see if there's a form without parentheses, eg.
// Bolivia (Plurinational State of) -> Bolivia
// Middle East & North Africa (developing only) -> Middle East & North Africa
// If we haven't found any explicit match, try the regex patterns. Examples:
// Bolivia (Plurinational State of) -> Bolivia
// "Middle East & North Africa (developing only)" matches /Middle East & North Africa (.*)/
if (result == null) {
String trialWithoutParentheses = strRemoveParentheses(trial);
if (trialWithoutParentheses != null) {
result = nameToCountryCode.get(trialWithoutParentheses);
// Check the trial name against each registered name pattern in turn.
for (Map.Entry<Pattern, String> pair : namePatternToCountryCode.entrySet()) {
    // matches() requires the pattern to cover the entire trial string.
    if (pair.getKey().matcher(trial).matches()) {
        result = pair.getValue();
        // Stop at the first pattern that matches; later patterns are not consulted.
        break;
    }
}
}

Expand All @@ -74,7 +79,7 @@ public static String getCodeFromName(String display, boolean showMissing, Set<St
+ ".\n"
+ "To fix: add to external/alternate_country_names.txt a line such as:\n"
+ "\t<code>;\t<name>;\t"
+ display + " " + strRemoveParentheses(display));
+ display);
if (missing != null) {
missing.add(display);
}
Expand All @@ -96,16 +101,6 @@ private static String reverseComma(String display) {
return trial;
}

/**
 * Returns the text that precedes the first opening parenthesis of {@code display}, with
 * leading and trailing whitespace trimmed.
 *
 * @param display the name to shorten
 * @return the trimmed prefix before the first {@code '('}, or {@code null} when
 *     {@code display} contains no opening parenthesis
 */
private static String strRemoveParentheses(String display) {
    final int openParenIndex = display.indexOf('(');
    // No parenthesis at all: signal "nothing to strip" with null.
    if (openParenIndex < 0) {
        return null;
    }
    return display.substring(0, openParenIndex).trim();
}

static {
try {
loadNames();
Expand All @@ -116,15 +111,15 @@ private static String strRemoveParentheses(String display) {

static void loadNames() throws IOException {
for (String country : ULocale.getISOCountries()) {
addName(ULocale.getDisplayCountry("und-" + country, "en"), country);
addNameToCountry(ULocale.getDisplayCountry("und-" + country, "en"), country);
}
StandardCodes sc = StandardCodes.make();
Set<String> goodAvailableCodes = sc.getGoodAvailableCodes("territory");

for (String country : goodAvailableCodes) {
String description = sc.getFullData("territory", country).get(0);
if (country.equals("057")) continue;
addName(description, country);
addNameToCountry(description, country);
}
CldrUtility.handleFile(
"external/alternate_country_names.txt", new MyHandler(goodAvailableCodes));
Expand All @@ -151,25 +146,33 @@ public boolean handle(String line) {
}
// Note: field 1 is ignored.

addName(pieces[2].trim(), country);
addNameToCountry(pieces[2].trim(), country);
return true;
}
}

static void addName(String key, String code) {
addName2(key, code);
static void addNameToCountry(String key, String code) {
addNameToCountryImpl(key, code);
String trial = reverseComma(key);
if (trial != null) {
addName2(trial, code);
addNameToCountryImpl(trial, code);
}
}

private static void addName2(String key, String code) {
private static void addNameToCountryImpl(String key, String code) {
key = key.toLowerCase(Locale.ENGLISH);

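// Keys written as /regex/ are compiled as regular expressions that map matching names to a code
// (note: the key has already been lowercased above, so the compiled pattern is lowercase too)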
if (key.startsWith("/") && key.endsWith("/")) {
namePatternToCountryCode.put(Pattern.compile(key.substring(1, key.length() - 1)), code);
return;
}

// Otherwise add to the regular lookup table
String old = nameToCountryCode.get(key);
if (key.startsWith("the ")) {
key = key.substring(4);
}
String old = nameToCountryCode.get(key);
if (old != null && !code.equals(old)) {
System.err.println("Conflict!!" + key + "\t" + old + "\t" + code);
return;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -220,40 +220,32 @@ UK;; U.K.
RS;; Yugoslavia
KM;; Comros

skip; skip; Africa Eastern and Southern
skip; skip; Africa Western and Central
419; Latin America & Caribbean; Latin America & Caribbean
419; Latin America & Caribbean; Latin America & the Caribbean

skip; skip; Arab World
skip; skip; Caribbean small states
skip; skip; Central Europe and the Baltics
skip; skip; Country Name
skip; skip; East Asia & Pacific
skip; skip; Euro area
skip; skip; Europe & Central Asia
skip; skip; Fragile and conflict affected situations
skip; skip; Heavily indebted poor countries
skip; skip; High income
skip; skip; High income: nonOECD
skip; skip; High income: OECD
skip; skip; IBRD only
skip; skip; IDA & IBRD total
skip; skip; IDA blend
skip; skip; IDA only
skip; skip; IDA total
skip; skip; Latin America & Caribbean
skip; skip; Latin America & the Caribbean
skip; skip; Heavily indebted poor countries (HIPC)
skip; skip; Least developed countries: UN classification
skip; skip; Low & middle income
skip; skip; Low income
skip; skip; Lower middle income
skip; skip; Middle East & North Africa
skip; skip; Middle income
skip; skip; OECD members
skip; skip; Other small states
skip; skip; Pacific island small states
skip; skip; Paracel Islands
skip; skip; /.* dividend/
skip; skip; Small states
skip; skip; South Asia
skip; skip; Sub-Saharan Africa
skip; skip; Sudan (pre-secession)
skip; skip; Upper middle income

skip; skip; /.* dividend/
skip; skip; /.* income(: .*)?/
skip; skip; /(IBRD|IDA) .*/
skip; skip; /Africa Eastern and Southern.*/
skip; skip; /Africa Western and Central.*/
skip; skip; /East Asia & Pacific.*/
skip; skip; /Europe & Central Asia.*/
skip; skip; /Latin America &( the)? Caribbean \(.*\)/
skip; skip; /Middle East & North Africa.*/
skip; skip; /South Asia.*/
skip; skip; /Sub-Saharan Africa .*/

0 comments on commit c9901a4

Please sign in to comment.