Skip to content

Commit

Permalink
CLDR-17535 Cleaned up now, ready for review
Browse files Browse the repository at this point in the history
  • Loading branch information
macchiati committed Aug 23, 2024
1 parent 1f0b110 commit 042f5a0
Show file tree
Hide file tree
Showing 8 changed files with 152 additions and 78 deletions.
25 changes: 13 additions & 12 deletions common/supplemental/languageGroup.xml

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -279,13 +279,13 @@ public static void showDiff(String title, Set<String> oldMinusOther) {
}
}

static String show(String languageCode) {
return languageCode.equals("mul")
? "Ω"
: ENGLISH.getName(CLDRFile.LANGUAGE_NAME, languageCode).replace(" (Other)", "")
+ " ⁅"
+ languageCode
+ "⁆";
/**
 * Formats a language code for display.
 *
 * <p>The code "mul" is rendered as "Ω". Any other code is rendered as the name returned by
 * {@link #getName(String)} followed by the code itself wrapped in ⁅…⁆.
 *
 * @param languageCode the language code to format; must not be null
 * @return the display string for the code
 */
public static String show(String languageCode) {
    if (languageCode.equals("mul")) {
        return "Ω";
    }
    final String name = getName(languageCode);
    return name + " ⁅" + languageCode + "⁆";
}

/**
 * Looks up the English language name for a code, with any " (Other)" suffix text removed.
 *
 * @param languageCode the language code to look up
 * @return the name from {@code ENGLISH} with " (Other)" stripped, or "(no name)" when the
 *     lookup returns null
 */
public static String getName(String languageCode) {
    final String englishName = ENGLISH.getName(CLDRFile.LANGUAGE_NAME, languageCode);
    if (englishName == null) {
        return "(no name)";
    }
    return englishName.replace(" (Other)", "");
}

public static void showErrors(String title, Multimap<String, String> oldErrors) {
Expand Down
28 changes: 21 additions & 7 deletions tools/cldr-rdf/external/RawLanguageContainment.txt

Large diffs are not rendered by default.

4 changes: 0 additions & 4 deletions tools/cldr-rdf/external/childToParent.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -2625,10 +2625,6 @@ http://www.wikidata.org/entity/Q3120736 http://www.wikidata.org/entity/Q1781084
http://www.wikidata.org/entity/Q3182573 http://www.wikidata.org/entity/Q1781084
http://www.wikidata.org/entity/Q3354339 http://www.wikidata.org/entity/Q1781084
http://www.wikidata.org/entity/Q56681 http://www.wikidata.org/entity/Q1781084
http://www.wikidata.org/entity/Q7124567 http://www.wikidata.org/entity/Q178440
http://www.wikidata.org/entity/Q27898 http://www.wikidata.org/entity/Q178440
http://www.wikidata.org/entity/Q33398 http://www.wikidata.org/entity/Q178440
http://www.wikidata.org/entity/Q56992 http://www.wikidata.org/entity/Q178440
http://www.wikidata.org/entity/Q5336735 http://www.wikidata.org/entity/Q1789745
http://www.wikidata.org/entity/Q34311 http://www.wikidata.org/entity/Q1789745
http://www.wikidata.org/entity/Q34311 http://www.wikidata.org/entity/Q33430
Expand Down
4 changes: 0 additions & 4 deletions tools/cldr-rdf/external/childToParentWithCodes.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,6 @@ Aighon (aix) (aix, Q350428) Pasismanua (Q2052656)
Aikanã (tba) (tba, Q340930) Q5173 (Q5173)
Aiki (Q469696) Maban (Q3598)
Aiklep (mwg) (mwg, Q339965) Ngero–Vitiaz (Q306442)
Aimaq (aiq) (aiq, Q2789) Dari (prs) (prs, Q17844)
Aimele (ail) (ail, Q332741) Bosavi (Q494712)
Aimol (aim) (aim, Q469717) Kuki-Chin (Q83241)
Ainu (Q5011197) Altaic [Other] (tut) (tut, Q3784)
Expand Down Expand Up @@ -3123,7 +3122,6 @@ Hawaiian (haw) (haw, Q3356) Marquesic (Q313351)
Hawu-Dhao (Q3031728) Sumba–Hawu (Q255265)
Haya (hay) (hay, Q3575) Bantu [Other] (bnt) (bnt, Q3314)
Haya–Jita (Q2550236) Bantu [Other] (bnt) (bnt, Q3314)
Hazaragi (haz) (haz, Q3339) Dari (prs) (prs, Q17844)
Hdi (xed) (xed, Q5624) Biu–Mandara (Q225154)
Hebrew (he) (he, Q928) Canaanite (Q74754)
Hehe (heh) (heh, Q312939) Southern Tanzania Highlands Bantu (Q11652166)
Expand Down Expand Up @@ -6897,7 +6895,6 @@ Pagibete (pae) (pae, Q712435) Bati–Angba (Q486930)
Pagu (pgu) (pgu, Q712446) North Halmahera (Q321735)
Pahari (phj) (phj, Q11263595) Newaric (Q5562506)
Pahi (lgt) (lgt, Q712454) Tama (Q768051)
Pahlavani (phv) (phv, Q712456) Dari (prs) (prs, Q17844)
Pahlavi (pal) (pal, Q3206) Middle Iranian (Q684146)
Pahlavi (pal) (pal, Q3206) Southwestern Iranian (Q39042)
Pahoturi (Q1704914) Trans-Fly (Q4876817)
Expand Down Expand Up @@ -10704,7 +10701,6 @@ Tangko (tkx) (tkx, Q768299) Ok (Q708168)
Tanglang (ytl) (ytl, Q778669) Lisoish (Q655905)
Tanglapui (Q768299) Alor–Pantar (Q350242)
Tangoa (tgp) (tgp, Q241027) North and Central Vanuatu (Q307030)
Tangshewi (tnf) (tnf, Q5699) Dari (prs) (prs, Q17844)
Tanguat (tbs) (tbs, Q768316) Ataitan (Q481265)
Tangut (txg) (txg, Q272793) Qiangic (Q163676)
Tangwang (Q768317) Sinitic (Q3385)
Expand Down
2 changes: 0 additions & 2 deletions tools/cldr-rdf/external/entityToLabel.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -2638,7 +2638,6 @@ http://www.wikidata.org/entity/Q1775596 Ibanag
http://www.wikidata.org/entity/Q1776032 Hill Mari
http://www.wikidata.org/entity/Q1781084 Koman
http://www.wikidata.org/entity/Q1781533 Congo-Saharan
http://www.wikidata.org/entity/Q178440 Dari
http://www.wikidata.org/entity/Q1785111 Mohawk Dutch
http://www.wikidata.org/entity/Q178806 Middle Dutch
http://www.wikidata.org/entity/Q1789745 Yoruboid
Expand Down Expand Up @@ -3854,7 +3853,6 @@ http://www.wikidata.org/entity/Q2576407 Vestinian
http://www.wikidata.org/entity/Q2577228 Mysian
http://www.wikidata.org/entity/Q2578935 Danzhou dialect
http://www.wikidata.org/entity/Q2579500 Ometepec Náhuatl
http://www.wikidata.org/entity/Q25803500 Q25803500
http://www.wikidata.org/entity/Q25803511 Q25803511
http://www.wikidata.org/entity/Q25803706 Q25803706
http://www.wikidata.org/entity/Q25803802 Q25803802
Expand Down
6 changes: 0 additions & 6 deletions tools/cldr-rdf/external/skippingCodes.tsv

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
import com.google.common.collect.LinkedHashMultimap;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multimaps;
import com.google.common.collect.Sets;
import com.google.common.collect.Sets.SetView;
import com.google.common.collect.SortedSetMultimap;
import com.google.common.collect.TreeMultimap;
import com.ibm.icu.impl.Row.R2;
Expand All @@ -35,10 +37,20 @@
import org.unicode.cldr.draft.FileUtilities;
import org.unicode.cldr.rdf.QueryClient;
import org.unicode.cldr.rdf.TsvWriter;
import org.unicode.cldr.util.*;
import org.unicode.cldr.util.CLDRConfig;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRPaths;
import org.unicode.cldr.util.Containment;
import org.unicode.cldr.util.DiffLanguageGroups;
import org.unicode.cldr.util.DtdType;
import org.unicode.cldr.util.Iso639Data;
import org.unicode.cldr.util.Iso639Data.Type;
import org.unicode.cldr.util.LocaleNames;
import org.unicode.cldr.util.SimpleXMLSource;
import org.unicode.cldr.util.StandardCodes;
import org.unicode.cldr.util.StandardCodes.LstrField;
import org.unicode.cldr.util.StandardCodes.LstrType;
import org.unicode.cldr.util.Validity;
import org.unicode.cldr.util.Validity.Status;

/**
Expand Down Expand Up @@ -89,6 +101,7 @@ public class GenerateLanguageContainment {
private static final QueryClient queryClient = QueryClient.getInstance();

static final Splitter TAB = Splitter.on('\t').trimResults();
private static final Joiner JOIN_TAB = Joiner.on('\t');
static final CLDRFile ENGLISH = CONFIG.getEnglish();
static final String relDir = "../util/data/languages/";
static final Map<String, R2<List<String>, String>> ALIAS_MAP =
Expand Down Expand Up @@ -250,8 +263,16 @@ void add(List<String> chain) {
}

/** To add parent-child relations to Wikidata */
static final Multimap<String, String> EXTRA_PARENT_CHILDREN =
static final Multimap<String, String> RESET_PARENT_CHILDREN =
ImmutableMultimap.<String, String>builder()
.put(LocaleNames.MUL, LocaleNames.UND) // anomaly
.put(LocaleNames.MUL, "art")// no containing language family
.put(LocaleNames.MUL, "euq")// no containing language family
.put(LocaleNames.MUL, "jpx")// no containing language family
.put(LocaleNames.MUL, "tai")// no containing language family
.put(LocaleNames.MUL, "ko") // no containing language family (Altaic is too controversial)
.put(LocaleNames.MUL, "crp") // no containing language family
.put(LocaleNames.MUL, "kgp") // no containing language family
.put("alv", "agq")
.put("alv", "cch") // Atlantic–Congo <= cch [Atsam]
.put("alv", "kcg") // Atlantic–Congo <= kcg [Tyap]
Expand Down Expand Up @@ -288,26 +309,41 @@ void add(List<String> chain) {
.put("ira", "bgn") // Iranian <= Western Balochi
.put("inc", "trw") // Indo-Aryan <= Torwali
.put("jpx", "ja")
.put(LocaleNames.MUL, "art")
.put(LocaleNames.MUL, "euq")
.put(LocaleNames.MUL, "jpx")
.put(LocaleNames.MUL, "tai")
.put("ngb", "sg")
.put("roa", "cpf")
.put("roa", "cpp")
.put("roa", "cpp")
.put("sdv", "saq")
.put("son", "khq")
.put("sw", "swc")
.put("tai", "blt") // tai [Tai] <= blt [Tai Dam]
.put("tai", "lo")
.put("tai", "th")
.put("zlw", "szl") // West Slavic <= Silesian
// Restoring languages removed in 2024-08 wikidata
.put("inc", "ur") // Urdu is indic
.put("inc", "pa") // Punjabi is indic
.put("inc", "skr") // Saraiki is indic
.put("zls", "bs") // South Slavic (sh has problems)
.put("zls", "hr") // South Slavic (sh has problems)
.put("zls", "sr") // South Slavic (sh has problems)
.put("inc", "hi") // Indic
.put("inc", "kok") // Indic
.put("inc", "ks") // Indic
.put("inc", "mr") // Indic
.put("inc", "sd") // Indic
.put("cr", "csw") // Cree
.put("tai", "za") // Tai
.put("fiu", "hu") // Finno-Ugric
.put("alg", "cr") // Algonquian
.put("sit", "bo") // Sino-Tibetan
.put("poz", "mg") // Malayo-Polynesian languages
.put("esx", "iu") // Eskimo-Aleut languages
.put("esx", "kl") // Eskimo-Aleut languages
.build();

/**
* To remove parent-child relations from Wikidata, eg if a child has two parents (where that
* causes problems)
* causes problems). Don't do it if there is an explicit parent above.
*/
static final Multimap<String, String> REMOVE_PARENT_CHILDREN =
ImmutableMultimap.<String, String>builder()
Expand All @@ -321,20 +357,11 @@ void add(List<String> chain) {
// [Pitcairn-Norfolk]
.put("inc", "rmg")
// Indo-European
.put("ine", "el")
.put("ine", "gmy")
.put("ine", "grc")
.put("ine", "trw") // inc [Indic] <= trw [Torwali]
.put(LocaleNames.MUL, "crp")
.put(LocaleNames.MUL, "cpp") // Creoles and pidgins, Portuguese-based
.put(LocaleNames.MUL, LocaleNames.UND) // anomaly
.put("nic", "kcp") // ssa [Nilo-Saharan] <= kcp [Kanga]
.put("nic", "kec") // ssa [Nilo-Saharan] <= kec [Keiga]
.put("nic", "kgo") // ssa [Nilo-Saharan] <= kgo [Krongo]
.put("nic", "rof") // ssa [Nilo-Saharan] <= rof [Rombo]
.put("nic", "tbr") // ssa [Nilo-Saharan] <= tbr [Tumtum]
.put("nic", "tey") // ssa [Nilo-Saharan] <= tey [Tulishi]
.put("sit", "th") // sit <= tbq <= th
.put("sit", "dz") // sit <= tbq <= dz
.put("sit", "zh")
.put("sla", "cu")
Expand All @@ -343,6 +370,13 @@ void add(List<String> chain) {
// language called Pasi.
.build();

static {
// If a child is in RESET_PARENT_CHILDREN, it should not be in REMOVE_PARENT_CHILDREN
// That is because the RESET_PARENT_CHILDREN will cause the removal of any other parents anyway.
SetView<String> bad = Sets.intersection(Set.copyOf(RESET_PARENT_CHILDREN.values()), Set.copyOf(REMOVE_PARENT_CHILDREN.values()));
if (!bad.isEmpty()) System.err.println("Remove from REMOVE_PARENT_CHILDREN, child values: \"" + Joiner.on("\",\"").join(bad)
+ "\"");
}
public static void main(String[] args) throws IOException {
new GenerateLanguageContainment().run(args);
if (Containment.hadErrors) {
Expand Down Expand Up @@ -416,19 +450,69 @@ void run(String[] args) throws IOException {
// TsvWriter.writeRow(w, "childCode\tLabel", "parentCode\tLabel"); // header
skipping.forEach(e -> w.println(e));
}

for (Entry<String, Collection<String>> entity : REMOVE_PARENT_CHILDREN.asMap().entrySet()) {
String key = entity.getKey();
for (String value : entity.getValue()) {
if (value.equals("*")) {
_parentToChild.removeAll(key);

// preflight
DiffLanguageGroups.show("en");

Multimap<String, String> _childToParents = Multimaps.invertFrom(_parentToChild, TreeMultimap.create());

System.out.println("\nOVERRIDE Remove parent");
System.out.println("OVERRIDE\tParent\tChild\tNew Parents");
for (Entry<String, String> entry : REMOVE_PARENT_CHILDREN.entries()) {
final String parent = entry.getKey();
final String child = entry.getValue();
Set<String> oldChildren = _parentToChild.get(parent);
String type;
if (child.equals("*")) {
if(oldChildren == null) {
type = "No remove";
} else {
type = "Removing parent";
_parentToChild.removeAll(parent);
_childToParents = Multimaps.invertFrom(_parentToChild, TreeMultimap.create());
}
} else {
_parentToChild.remove(key, value);
if(oldChildren != null && oldChildren.contains(child)) {
_parentToChild.remove(parent, child);
_childToParents = Multimaps.invertFrom(_parentToChild, TreeMultimap.create());
type = "Removing parent";
} else {
type = "No remove";
}
}
}
System.out.println(JOIN_TAB.join(type, DiffLanguageGroups.show(parent), DiffLanguageGroups.show(child), _childToParents.get(child)
));
}

_parentToChild.putAll(EXTRA_PARENT_CHILDREN);
System.out.println("\nOVERRIDE Replace Parent");
System.out.println("OVERRIDE\tParent\tChild");
for (Entry<String, String> entry : RESET_PARENT_CHILDREN.entries()) {
final String parent = entry.getKey();
final String child = entry.getValue();
Set<String> oldValues = _parentToChild.get(parent);
Set<String> removals = new LinkedHashSet<>();

String type;
if(oldValues != null && oldValues.contains(child)) {
type = "Redundant add";
} else {
type = "Changing";
_parentToChild.put(parent,child);
_childToParents = Multimaps.invertFrom(_parentToChild, TreeMultimap.create());
Collection<String> newParents = _childToParents.get(child);
if (newParents.size() > 1) {
for (String parent2 : newParents) {
if (!parent2.equals(parent)) {
_parentToChild.remove(parent2,child);
removals.add(parent2);
}
// rebuild
_childToParents = Multimaps.invertFrom(_parentToChild, TreeMultimap.create());
}
}
}
System.out.println(JOIN_TAB.join(type,DiffLanguageGroups.show(parent),DiffLanguageGroups.show(child), _childToParents.get(child), removals));
}

// special code for artificial
for (String code : Iso639Data.getAvailable()) {
Expand Down Expand Up @@ -471,16 +555,7 @@ void run(String[] args) throws IOException {
// childNames + "\t" + parentNames);
// }
QUERY_HELPER.writeTsvs();

writeDifferences(parentToChild);
}

private void writeDifferences(Multimap<String, String> parentToChild) {
System.out.println("\nReading old supplemental: may have unrelated errors.");
final SupplementalDataInfo oldSupplementalInfo =
SupplementalDataInfo.getInstance(
CldrUtility.getPath(CLDRPaths.LAST_COMMON_DIRECTORY, "supplemental/"));
oldSupplementalInfo.contain
DiffLanguageGroups.main(new String[] {});
}

private static void showEntityLists(String title, Set<List<String>> ancestors) {
Expand Down

0 comments on commit 042f5a0

Please sign in to comment.