Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CLDR-17884 Regenerate AddPopulationData, ConvertLanguageData, reduce standard out noise #3965

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
783 changes: 398 additions & 385 deletions common/supplemental/supplementalData.xml

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -85,11 +85,13 @@ static ArrayList<Pair<WBLine, Integer>> parseHeader(final String[] pieces) {
}
}

enum FBLine {
Rank,
Country,
enum FactbookLine {
CountryName,
CountrySlug,
Value,
Year;
DateOfInformation,
Ranking,
Region;

String get(String[] pieces) {
return pieces[ordinal()];
Expand Down Expand Up @@ -200,6 +202,10 @@ private static void showCountryData(String country) {
+ percent.format(getLiteracy(country) / 100));
}

/**
* Gets the percentage of people who can read in a particular country. Values are in the range 0
* to 100.
*/
public static Double getLiteracy(String country) {
return firstNonZero(
factbook_literacy.getCount(country),
Expand Down Expand Up @@ -275,16 +281,13 @@ private static void loadFactbookInfo(String filename, final Counter2<String> fac
new LineHandler() {
@Override
public boolean handle(String line) {
if (line.length() == 0
|| line.startsWith("This tab")
|| line.startsWith("Rank")
|| line.startsWith(" This file")) {
String[] pieces = splitCommaSeparated(line);
String countryName = FactbookLine.CountryName.get(pieces);
if (countryName.equals("name")) {
return false;
}
String[] pieces = line.split("\\s{2,}");
String code =
CountryCodeConverter.getCodeFromName(
FBLine.Country.get(pieces), true, missing);
CountryCodeConverter.getCodeFromName(countryName, true, missing);
if (code == null) {
return false;
}
Expand All @@ -295,7 +298,7 @@ public boolean handle(String line) {
return false;
}
code = code.toUpperCase(Locale.ENGLISH);
String valueString = FBLine.Value.get(pieces).trim();
String valueString = FactbookLine.Value.get(pieces).trim();
if (valueString.startsWith("$")) {
valueString = valueString.substring(1);
}
Expand Down Expand Up @@ -395,7 +398,9 @@ public boolean handle(String line) {
return false;
}
code = code.toUpperCase(Locale.ENGLISH);
String valueString = FBLiteracy.Percent.get(pieces).trim();
String valueString =
FBLiteracy.Percent.get(pieces)
.trim(); // Values are in the range 0 to 100
double percent = Double.parseDouble(valueString);
factbook_literacy.put(code, percent);
if (ADD_POP) {
Expand Down Expand Up @@ -521,7 +526,10 @@ static List<Pair<String, Double>> getUnLiteracy(Output<Boolean> hadErr) throws I
continue;
}
double total = literate + illiterate;
double percent = ((double) literate) / total;
double percent =
((double) literate)
* 100
/ total; // Multiply by 100 to put values in range 0 to 100
result.add(Pair.of(code, percent));
}
if (result.isEmpty()) {
Expand All @@ -535,8 +543,8 @@ static List<Pair<String, Double>> getUnLiteracy(Output<Boolean> hadErr) throws I
loadFactbookLiteracy();
loadUnLiteracy();

loadFactbookInfo("external/factbook_gdp_ppp.txt", factbook_gdp);
loadFactbookInfo("external/factbook_population.txt", factbook_population);
loadFactbookInfo("external/factbook_gdp_ppp.csv", factbook_gdp);
loadFactbookInfo("external/factbook_population.csv", factbook_population);
CldrUtility.handleFile("external/other_country_data.txt", new MyLineHandler(other));

loadWorldBankInfo();
Expand Down Expand Up @@ -577,7 +585,7 @@ static List<Pair<String, Double>> getUnLiteracy(Output<Boolean> hadErr) throws I
}
if (myErrors.length() != 0) {
throw new IllegalArgumentException(
"Missing Country values, the following and add to external/other_country_data to fix, chaning the 0 to the real value:"
"Missing Country values, the following and add to external/other_country_data to fix, changing the 0 to the real value:"
+ myErrors);
}
} catch (IOException e) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -180,9 +180,17 @@ private void handleRecord() {
throw new IllegalArgumentException(
"Inconsistent reliability " + reliability + " for " + thisRecord);
}
final Long old = pa.perLiteracy.put(literacy, getLongValue());
if (old != null) {
System.err.println("Duplicate record " + country + " " + year + " " + age);
final Long new_value = getLongValue();
final Long old_value = pa.perLiteracy.put(literacy, new_value);
if (old_value != null) {
// Suriname is known to include duplicate records: one normal and one "Excluding the
// institutional population".
// Resolve this by taking the higher value.
if (country.equals("Suriname")) {
pa.perLiteracy.put(literacy, Math.max(old_value, new_value));
} else {
System.err.println("Duplicate record " + country + " " + year + " " + age);
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,8 @@ Chad TD "15,833,116" 35% "28,620,000,000" official French fr 26% https://www.c
Chile CL "17,925,262" 99% "452,100,000,000" English en 9.5%
Chile CL "17,925,262" 99% "452,100,000,000" Mapuche arn "272,000" http://en.wikipedia.org/wiki/Mapuche_language
Chile CL "17,925,262" 99% "452,100,000,000" official Spanish es 98% "http://en.wikipedia.org/wiki/Demographics_of_Chile#Languages Spanish ""universal"", set to 98%"
China CN "1,384,688,986" 95% "23,210,000,000,000" Cantonese (Simplified) yue_Hans 5.2% 5% "Mainly in Guangdong Prov, ~70-80 million"
China CN "1,384,688,986" 95% "23,210,000,000,000" Cantonese (Simplified) yue_Hans 5.2% 5% "Mainly in Guangdong Prov, ~70-80 million. Script unspecified so both listed"
China CN "1,384,688,986" 95% "23,210,000,000,000" Cantonese (Traditional) yue 5.2% 5% "Mainly in Guangdong Prov, ~70-80 million. Script unspecified so both listed"
China CN "1,384,688,986" 95% "23,210,000,000,000" official Chinese zh 90%
China CN "1,384,688,986" 95% "23,210,000,000,000" English en "62,900"
China CN "1,384,688,986" 95% "23,210,000,000,000" Gan Chinese gan "22,900,000"
Expand Down Expand Up @@ -1114,7 +1115,7 @@ Russia RU "142,122,776" 100% "4,016,000,000,000" official_regional Erzya myv "43
Russia RU "142,122,776" 100% "4,016,000,000,000" Finnish fi "17,000"
Russia RU "142,122,776" 100% "4,016,000,000,000" Ingrian izh 120
Russia RU "142,122,776" 100% "4,016,000,000,000" official_regional Ingush inh "230,000"
Russia RU "142,122,776" 100% "4,016,000,000,000" Kara-Kalpak kaa 0.0006% https://joshuaproject.net/languages/kaa
Russia RU "142,122,776" 100% "4,016,000,000,000" Kara-Kalpak kaa 0.0006% https://joshuaproject.net/languages/kaa
Russia RU "142,122,776" 100% "4,016,000,000,000" official_regional Kabardian kbd "440,000"
Russia RU "142,122,776" 100% "4,016,000,000,000" official_regional Karachay-Balkar krc "235,000"
Russia RU "142,122,776" 100% "4,016,000,000,000" Karelian krl "117,000"
Expand Down Expand Up @@ -1467,7 +1468,7 @@ Unknown Region ZZ 0 0% 0 Novial nov 0 99% An artificial language. See http://
Unknown Region ZZ 0 0% 0 Toki Pona tok 800 https://en.wikipedia.org/wiki/Toki_Pona
Unknown Region ZZ 0 0% 0 Volapük vo 200 99% "http://en.wikipedia.org/wiki/Volap%C3%BCk Artificial: 'There are an estimated 20-30 Volapük speakers in the world today.'; see also http://www.villagevoice.com/arts/0031,lafarge,16942,12.html"
Uruguay UY "3,369,299" 98% "78,160,000,000" official Spanish es 88%
Uzbekistan UZ "36,799,756" 99% "223,000,000,000" Kara-Kalpak kaa 2.1% https://joshuaproject.net/languages/kaa
Uzbekistan UZ "36,799,756" 99% "223,000,000,000" Kara-Kalpak kaa 2.1% https://joshuaproject.net/languages/kaa
Uzbekistan UZ "36,799,756" 99% "223,000,000,000" Russian ru 14%
Uzbekistan UZ "36,799,756" 99% "223,000,000,000" Turkish tr "228,000"
Uzbekistan UZ "36,799,756" 99% "223,000,000,000" official Uzbek uz 85% "http://en.wikipedia.org/wiki/Uzbek_language#Writing_systems https://www.cia.gov/library/publications/the-world-factbook/geos/uz.html Latin/Cyrillic balance is estimated, based on literacy; younger education now in Latin"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -142,13 +142,17 @@ SY; Syria; Syrian Arab Republic
SZ; Eswatini; eSwatini; Swaziland
SZ; Eswatini; Swaziland

SH; Saint Helena; Saint Helena
SH; Saint Helena; St. Helena
SH; Saint Helena; Saint Helena, Ascension, and Tristan da Cunha
SH; Saint Helena; Saint Helena, Ascension and Tristan da Cunha
SH; Saint Helena; Saint Helena ex. dep.

TL; East Timor; Timor-Leste
TL; East Timor; East Timor

TR; Turkey; Turkiye
TR; Turkey; Turkey (Turkiye)
TR; ; Turkey


Expand Down Expand Up @@ -198,11 +202,11 @@ RE; ; Reunion
PS; ; Palestinian Territory
CD; ; Congo, Democratic Republic
FX; ; France, Metropolitan
SH; ; St. Helena
SJ; ; Svalbard and Jan Mayen Islands
VA; ; Vatican
CW; ; Netherlands Antilles
WF; ; Wallis and Futuna Islands
WF; ; Wallis and Futuna
HM; ; Heard and McDonald Islands
PM; ; St. Pierre and Miquelon

Expand All @@ -220,34 +224,9 @@ UK;; U.K.
RS;; Yugoslavia
KM;; Comros

skip; skip; Arab World
skip; skip; Caribbean small states
skip; skip; Country Name
skip; skip; East Asia & Pacific (all income levels)
skip; skip; East Asia & Pacific (developing only)
skip; skip; Euro area
skip; skip; Europe & Central Asia (all income levels)
skip; skip; Europe & Central Asia (developing only)
skip; skip; Heavily indebted poor countries (HIPC)
skip; skip; High income
skip; skip; High income: nonOECD
skip; skip; High income: OECD
skip; skip; Latin America & Caribbean (all income levels)
skip; skip; Latin America & Caribbean (developing only)
skip; skip; Least developed countries: UN classification
skip; skip; Low & middle income
skip; skip; Low income
skip; skip; Lower middle income
skip; skip; Middle East & North Africa (all income levels)
skip; skip; Middle East & North Africa (developing only)
skip; skip; Middle income
skip; skip; OECD members
skip; skip; Other small states
skip; skip; Pacific island small states
skip; skip; Small states
skip; skip; South Asia
skip; skip; Sub-Saharan Africa (all income levels)
skip; skip; Sub-Saharan Africa (developing only)
skip; skip; Sudan (pre-secession)
skip; skip; Upper middle income
skip; skip; Paracel Islands
419; Latin America & Caribbean; Latin America & Caribbean
419; Latin America & Caribbean; Latin America & the Caribbean

# Many of the skipped values below are aggregates from world_bank_data that we can ignore since they don't correspond to UN country groups

skip; skip; Paracel Islands
Loading
Loading