From d64790d0ac44433f684e734197aea0daf524c960 Mon Sep 17 00:00:00 2001 From: Conrad Nied Date: Wed, 18 Sep 2024 10:21:35 -0700 Subject: [PATCH] CLDR-17897 Make ConvertLanguageData Consistent (#4015) If we re-run ConvertLanguageData on unrelated data, it will update the order and values of some other data -- this fixes inconsistencies with the XML outputs to match expectations. The biggest change was updating values in `language_script.tsv` to demote script variations to secondary when they really are not expected. Furthermore I added explicit annotations to `country_language_population.tsv` when the writing system for a country was a variant. Scripts ran: mvn package -DskipTests=true java -jar tools/cldr-code/target/cldr-code.jar ConvertLanguageData java -jar tools/cldr-code/target/cldr-code.jar GenerateLikelySubtags --- common/supplemental/likelySubtags.xml | 11 ++++++ common/supplemental/supplementalData.xml | 35 +++++++++++++------ .../cldr/tool/GenerateLikelySubtags.java | 11 +++--- .../util/data/country_language_population.tsv | 11 +++--- .../cldr/util/data/language_script.tsv | 17 +++++---- 5 files changed, 58 insertions(+), 27 deletions(-) diff --git a/common/supplemental/likelySubtags.xml b/common/supplemental/likelySubtags.xml index 3dc3523d626..ce4e105a831 100644 --- a/common/supplemental/likelySubtags.xml +++ b/common/supplemental/likelySubtags.xml @@ -261,6 +261,8 @@ not be patched by hand, as any changes made in that fashion may be lost. + + @@ -434,6 +436,8 @@ not be patched by hand, as any changes made in that fashion may be lost. + + @@ -498,6 +502,8 @@ not be patched by hand, as any changes made in that fashion may be lost. + + @@ -567,6 +573,10 @@ not be patched by hand, as any changes made in that fashion may be lost. + + + + @@ -1036,6 +1046,7 @@ not be patched by hand, as any changes made in that fashion may be lost. + diff --git a/common/supplemental/supplementalData.xml b/common/supplemental/supplementalData.xml index 65b2d6819ea..5d09af73a59 100644 --- a/common/supplemental/supplementalData.xml +++ b/common/supplemental/supplementalData.xml @@ -1647,7 +1647,7 @@ XXX Code for transations where no currency is involved - + @@ -1890,7 +1890,8 @@ XXX Code for transations where no currency is involved - + + @@ -1979,7 +1980,7 @@ XXX Code for transations where no currency is involved - + @@ -2070,14 +2071,15 @@ XXX Code for transations where no currency is involved - + - + + @@ -2277,11 +2279,12 @@ XXX Code for transations where no currency is involved - + + - - + + @@ -2309,8 +2312,8 @@ XXX Code for transations where no currency is involved - - + + @@ -3072,6 +3075,7 @@ XXX Code for transations where no currency is involved + @@ -3999,6 +4003,7 @@ XXX Code for transations where no currency is involved + @@ -4226,7 +4231,6 @@ XXX Code for transations where no currency is involved - @@ -4235,11 +4239,13 @@ XXX Code for transations where no currency is involved + + @@ -4257,6 +4263,8 @@ XXX Code for transations where no currency is involved + + @@ -5692,6 +5700,7 @@ XXX Code for transations where no currency is involved This is base pop for """"""""""""""""""""""""""""""""fub"""""""""""""""""""""""""""""""" lang code; ff shows as a macrolanguage [missing] (could be higher if 2nd lang included; no data yet) + [missing] [missing] [missing] pop 7k. Figure is questionable writing pop artificially set to 5% see also http://en.wikipedia.org/wiki/Lower_Sorbian @@ -5805,5 +5814,9 @@ XXX Code for transations where no currency is involved Analyzed from 2011 UK census and other sources In total 86.2% of Canadians have working knowledge of English while 29.8% have a working knowledge of French. 2014 Maldives: 98% literacy in Divehi, 75% in English + [missing] + [missing] + Greek population in Russia -- most ancestrally used Pontic Greek -- modern usage almost certainly has dropped off but we don't have clear statistics on current usage. + [missing] diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateLikelySubtags.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateLikelySubtags.java index 44dfa981b8b..b14bfbb81d3 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateLikelySubtags.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateLikelySubtags.java @@ -392,6 +392,10 @@ public static void main(String[] args) throws IOException { {"mro", "mro_Mroo_BD"}, {"mro_BD", "mro_Mroo_BD"}, {"ms_Arab", "ms_Arab_MY"}, + {"nan", "nan_Hans_CN"}, + {"nan_Hant", "nan_Hant_TW"}, + {"nan_Hans", "nan_Hans_CN"}, + {"nan_TW", "nan_Hant_TW"}, {"pap", "pap_Latn_CW"}, {"pap_Latn", "pap_Latn_CW"}, { @@ -469,14 +473,9 @@ public static void main(String[] args) throws IOException { // {"cr", "cr_Cans_CA"}, // {"hif", "hif_Latn_FJ"}, // {"gon", "gon_Telu_IN"}, - // {"lzz", "lzz_Latn_TR"}, // {"lif", "lif_Deva_NP"}, // {"unx", "unx_Beng_IN"}, // {"unr", "unr_Beng_IN"}, - // {"ttt", "ttt_Latn_AZ"}, - // {"pnt", "pnt_Grek_GR"}, - // {"tly", "tly_Latn_AZ"}, - // {"tkr", "tkr_Latn_AZ"}, // {"bsq", "bsq_Bass_LR"}, // {"ccp", "ccp_Cakm_BD"}, // {"blt", "blt_Tavt_VN"}, @@ -505,6 +504,7 @@ public static void main(String[] args) throws IOException { // additions for missing values from LikelySubtagsText {"und_Arab_AF", "fa_Arab_AF"}, + {"und_Arab_AZ", "az_Arab_AZ"}, {"und_Cyrl_BG", "bg_Cyrl_BG"}, {"und_Tibt_BT", "dz_Tibt_BT"}, {"und_Cyrl_BY", "be_Cyrl_BY"}, @@ -518,6 +518,7 @@ public static void main(String[] args) throws IOException { {"und_Cyrl_RS", "sr_Cyrl_RS"}, {"und_Cyrl_TJ", "tg_Cyrl_TJ"}, {"und_Cyrl_UA", "uk_Cyrl_UA"}, + {"und_Hans_TW", "zh_Hans_TW"}, {"arc_Hatr", "arc_Hatr_IQ"}, {"hnj_Hmng", "hnj_Hmng_LA"}, {"bap_Krai", "bap_Krai_IN"}, diff --git a/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/country_language_population.tsv b/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/country_language_population.tsv index 64c5e34ffd8..1c423b54848 100644 --- a/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/country_language_population.tsv +++ b/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/country_language_population.tsv @@ -253,10 +253,10 @@ Chile CL "17,925,262" 99% "452,100,000,000" Mapuche arn "272,000" http://en.w Chile CL "17,925,262" 99% "452,100,000,000" official Spanish es 98% "http://en.wikipedia.org/wiki/Demographics_of_Chile#Languages Spanish ""universal"", set to 98%" China CN "1,384,688,986" 95% "23,210,000,000,000" Cantonese (Simplified) yue_Hans 5.2% 5% "Mainly in Guangdong Prov, ~70-80 million. Script unspecified so both listed" China CN "1,384,688,986" 95% "23,210,000,000,000" Cantonese (Traditional) yue 5.2% 5% "Mainly in Guangdong Prov, ~70-80 million. Script unspecified so both listed" -China CN "1,384,688,986" 95% "23,210,000,000,000" official Chinese zh 90% +China CN "1,384,688,986" 95% "23,210,000,000,000" official Chinese zh_Hans 90% China CN "1,384,688,986" 95% "23,210,000,000,000" English en "62,900" China CN "1,384,688,986" 95% "23,210,000,000,000" Gan Chinese gan "22,900,000" -China CN "1,384,688,986" 95% "23,210,000,000,000" Hakka Chinese hak "31,200,000" +China CN "1,384,688,986" 95% "23,210,000,000,000" Hakka Chinese hak_Hans "31,200,000" China CN "1,384,688,986" 95% "23,210,000,000,000" Kazakh (Arabic) kk_Arab "1,180,000" http://en.wikipedia.org/wiki/Kazakh_language China CN "1,384,688,986" 95% "23,210,000,000,000" official_regional Korean ko "2,040,000" China CN "1,384,688,986" 95% "23,210,000,000,000" Kyrgyz (Arabic) ky_Arab "466,000" @@ -435,6 +435,7 @@ Georgia GE "4,926,087" 100% "39,850,000,000" Kurdish ku "43,600" Georgia GE "4,926,087" 100% "39,850,000,000" Mingrelian xmf 11% Georgia GE "4,926,087" 100% "39,850,000,000" official_regional Ossetic os "109,000" Georgia GE "4,926,087" 100% "39,850,000,000" Russian ru 9% https://www.cia.gov/cia/publications/factbook/fields/2098.html +Georgia GE "4,926,087" 100% "39,850,000,000" Laz lzz_Geor 100 https://en.wikipedia.org/wiki/Laz_people#cite_note-ethnologue-1 Germany DE "80,457,737" 99% "4,199,000,000,000" Bavarian bar 17% 5% "https://en.wikipedia.org/wiki/Bavarian_language Widely spoken less written, and most speakers know standard German as well" Germany DE "80,457,737" 99% "4,199,000,000,000" Colognian ksh "245,000" http://www.ethnologue.com/show_language.asp?code=ksh Germany DE "80,457,737" 99% "4,199,000,000,000" Croatian hr "635,000" @@ -1128,6 +1129,7 @@ Russia RU "142,122,776" 100% "4,016,000,000,000" Mari chm "522,000" http://ww Russia RU "142,122,776" 100% "4,016,000,000,000" official_regional Moksha mdf "296,000" Russia RU "142,122,776" 100% "4,016,000,000,000" Mongolian mn "2,100" Russia RU "142,122,776" 100% "4,016,000,000,000" Ossetic os "456,000" http://www.gks.ru/free_doc/new_site/population/demo/per-itog/tab6.xls census data +Russia RU "142,122,776" 100% "4,016,000,000,000" Pontic pnt_Cyrl "56,168" https://en.wikipedia.org/wiki/Greeks_in_Russia_and_Ukraine#cite_ref-15 Greek population in Russia -- most ancestrally used Pontic Greek -- modern usage almost certainly has dropped off but we don't have clear statistics on current usage. Russia RU "142,122,776" 100% "4,016,000,000,000" official Russian ru 94% http://en.wikipedia.org/wiki/Russian_language (94% of studends in Russia receive primarily Russian-language ed) Russia RU "142,122,776" 100% "4,016,000,000,000" official_regional Sakha sah "461,000" http://en.wikipedia.org/wiki/Sakha_language Also called Sakha. Russia RU "142,122,776" 100% "4,016,000,000,000" Serbian (Latin) sr_Latn "5,000" @@ -1290,8 +1292,8 @@ Syria SY "19,454,263" 84% "50,280,000,000" Kurdish ku 8% Syria SY "19,454,263" 84% "50,280,000,000" Syriac syr "16,400" 5% "For languages not customarily written, the writing population is artificially set to 5% in the absence of better information." Syria SY "19,454,263" 84% "50,280,000,000" Levantine Arabic apc "16,633,300" https://en.wikipedia.org/wiki/Levantine_Arabic#Speakers_by_country Taiwan TW "23,545,963" 96% "1,189,000,000,000" official Chinese (Traditional) zh_Hant 95% -Taiwan TW "23,545,963" 96% "1,189,000,000,000" official Hokkien nan "13,500,000" https://en.wikipedia.org/wiki/Taiwanese_Hokkien -Taiwan TW "23,545,963" 96% "1,189,000,000,000" official Hakka hak "2,580,000" https://en.wikipedia.org/wiki/Taiwanese_Hakka +Taiwan TW "23,545,963" 96% "1,189,000,000,000" official Hokkien nan_Hant "13,500,000" https://en.wikipedia.org/wiki/Taiwanese_Hokkien +Taiwan TW "23,545,963" 96% "1,189,000,000,000" official Hakka hak_Hant "2,580,000" https://en.wikipedia.org/wiki/Taiwanese_Hakka Taiwan TW "23,545,963" 96% "1,189,000,000,000" Taroko trv "4,750" Tajikistan TJ "8,604,882" 100% "28,430,000,000" Arabic ar "1,000" Tajikistan TJ "8,604,882" 100% "28,430,000,000" Persian fa "66,900" @@ -1362,6 +1364,7 @@ Turkey TR "81,257,239" 94% "2,186,000,000,000" Kirmanjki kiu "158,000" Turkey TR "81,257,239" 94% "2,186,000,000,000" Kurdish ku 5.5% Turkey TR "81,257,239" 94% "2,186,000,000,000" Kyrgyz (Latin) ky_Latn "1,140" Turkey TR "81,257,239" 94% "2,186,000,000,000" Laz lzz "22,600" +Turkey TR "81,257,239" 94% "2,186,000,000,000" Pontic pnt_Latn "5,100" https://joshuaproject.net/people_groups/14444/TU Turkey TR "81,257,239" 94% "2,186,000,000,000" Serbian (Latin) sr_Latn "22,700" 5% "For languages not customarily written, the writing population is artificially set to 5% in the absence of better information." Turkey TR "81,257,239" 94% "2,186,000,000,000" official Turkish tr 93% "http://en.wikipedia.org/wiki/Turkish_language#Geographic_distribution http://ec.europa.eu/public_opinion/archives/ebs/ebs_243_en.pdf Europeans and their languages survey, page 7" Turkey TR "81,257,239" 94% "2,186,000,000,000" Turoyo tru "3,000" diff --git a/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/language_script.tsv b/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/language_script.tsv index 19e8da361b2..c8eabb9e545 100644 --- a/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/language_script.tsv +++ b/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/language_script.tsv @@ -297,6 +297,7 @@ ha Hausa primary Arab Arabic ha Hausa primary Latn Latin hai Haida primary Latn Latin hak Hakka Chinese primary Hans Simplified +hak Hakka Chinese primary Hant Traditional haw Hawaiian primary Latn Latin haz Hazaragi primary Arab Arabic he Hebrew primary Hebr Hebrew @@ -492,8 +493,8 @@ luz Southern Luri primary Arab Arabic lv Latvian primary Latn Latin lwl Eastern Lawa primary Thai Thai lzh Literary Chinese secondary Hans Simplified -lzz Laz primary Geor Georgian lzz Laz primary Latn Latin +lzz Laz secondary Geor Georgian mad Madurese primary Latn Latin maf Mafa primary Latn Latin mag Magahi primary Deva Devanagari @@ -564,6 +565,7 @@ myz Classical Mandaic secondary Mand Mandaean mzn Mazanderani primary Arab Arabic na Nauru primary Latn Latin nan Min Nan Chinese primary Hans Simplified +nan Min Nan Chinese primary Hant Traditional nap Neapolitan primary Latn Latin naq Nama primary Latn Latin nb Norwegian (Bokmål) primary Latn Latin @@ -633,6 +635,7 @@ pdt Plautdietsch primary Latn Latin peo Old Persian secondary Xpeo Old Persian pfl Palatine German primary Latn Latin phn Phoenician secondary Phnx Phoenician +pi Pali primary Mymr Myanmar pi Pali secondary Deva Devanagari pi Pali secondary Sinh Sinhala pi Pali secondary Thai Thai @@ -640,9 +643,9 @@ pis Pijin primary Latn Latin pko Pökoot primary Latn Latin pl Polish primary Latn Latin pms Piedmontese primary Latn Latin -pnt Pontic primary Cyrl Cyrillic pnt Pontic primary Grek Greek -pnt Pontic primary Latn Latin +pnt Pontic secondary Cyrl Cyrillic +pnt Pontic secondary Latn Latin pon Pohnpeian primary Latn Latin pqm Malecite primary Latn Latin prd Parsi-Dari primary Arab Arabic @@ -815,13 +818,13 @@ tk Turkmen primary Arab Arabic tk Turkmen primary Cyrl Cyrillic tk Turkmen primary Latn Latin tkl Tokelau primary Latn Latin -tkr Tsakhur primary Cyrl Cyrillic tkr Tsakhur primary Latn Latin +tkr Tsakhur secondary Cyrl Cyrillic tkt Kathoriya Tharu primary Deva Devanagari tli Tlingit primary Latn Latin -tly Talysh primary Arab Arabic -tly Talysh primary Cyrl Cyrillic tly Talysh primary Latn Latin +tly Talysh secondary Arab Arabic +tly Talysh secondary Cyrl Cyrillic tmh Tamashek primary Latn Latin tn Tswana primary Latn Latin to Tongan primary Latn Latin @@ -842,9 +845,9 @@ tsj Tshangla primary Tibt Tibetan tt Tatar primary Cyrl Cyrillic ttj Tooro primary Latn Latin tts Northeastern Thai primary Thai Thai -ttt Muslim Tat primary Cyrl Cyrillic ttt Muslim Tat primary Latn Latin ttt Muslim Tat secondary Arab Arabic +ttt Muslim Tat secondary Cyrl Cyrillic tum Tumbuka primary Latn Latin tvl Tuvalu primary Latn Latin twq Tasawaq primary Latn Latin