From 6d404721a6ad657aaa22425ca96e65a27d6205be Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Tue, 10 Sep 2024 16:37:44 -0500 Subject: [PATCH 1/8] CLDR-16720 json: add transforms - new package cldr-transforms - add manifest file transforms.json at the top level - each transform has a metadata file (transforms/ID.json) and a raw text file (transforms/ID.txt). - metadata has all of the keys from the transform rule - the _rulesFile key formally indicates the textfile's name (in case we need to massage the id for some reason in the future). --- .../java/org/unicode/cldr/json/CldrNode.java | 10 +++- .../unicode/cldr/json/Ldml2JsonConverter.java | 59 +++++++++++++++++-- .../unicode/cldr/json/LdmlConvertRules.java | 9 ++- .../org/unicode/cldr/util/CLDRTransforms.java | 15 +++++ .../cldr/json/JSON_config_transforms.txt | 2 + 5 files changed, 87 insertions(+), 8 deletions(-) create mode 100644 tools/cldr-code/src/main/resources/org/unicode/cldr/json/JSON_config_transforms.txt diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/json/CldrNode.java b/tools/cldr-code/src/main/java/org/unicode/cldr/json/CldrNode.java index a6559730bcc..d272cee893d 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/json/CldrNode.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/json/CldrNode.java @@ -23,7 +23,15 @@ public static CldrNode createNode( String fullTrunk = extractAttrs(fullPathSegment, node.nondistinguishingAttributes); if (!node.name.equals(fullTrunk)) { throw new ParseException( - "Error in parsing \"" + pathSegment + " \":\"" + fullPathSegment, 0); + "Error in parsing \"" + + pathSegment + + "\":\"" + + fullPathSegment + + " - " + + node.name + + " != " + + fullTrunk, + 0); } for (String key : node.distinguishingAttributes.keySet()) { diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/json/Ldml2JsonConverter.java b/tools/cldr-code/src/main/java/org/unicode/cldr/json/Ldml2JsonConverter.java index 35c413c1019..f8d5432b5b0 100644 --- 
a/tools/cldr-code/src/main/java/org/unicode/cldr/json/Ldml2JsonConverter.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/json/Ldml2JsonConverter.java @@ -49,6 +49,7 @@ import org.unicode.cldr.util.CLDRLocale; import org.unicode.cldr.util.CLDRPaths; import org.unicode.cldr.util.CLDRTool; +import org.unicode.cldr.util.CLDRTransforms; import org.unicode.cldr.util.CLDRURLS; import org.unicode.cldr.util.CalculatedCoverageLevels; import org.unicode.cldr.util.CldrUtility; @@ -88,6 +89,7 @@ public class Ldml2JsonConverter { private static final String CLDR_PKG_PREFIX = "cldr-"; private static final String FULL_TIER_SUFFIX = "-full"; private static final String MODERN_TIER_SUFFIX = "-modern"; + private static final String TRANSFORM_RAW_SUFFIX = ".txt"; private static Logger logger = Logger.getLogger(Ldml2JsonConverter.class.getName()); enum RunType { @@ -98,7 +100,8 @@ enum RunType { rbnf(false, true), annotations, annotationsDerived, - bcp47(false, false); + bcp47(false, false), + transforms(false, false); private final boolean isTiered; private final boolean hasLocales; @@ -739,6 +742,8 @@ private int convertCldrItems( outFilename = filenameAsLangTag + ".json"; } else if (type == RunType.bcp47) { outFilename = filename + ".json"; + } else if (type == RunType.transforms) { + outFilename = filename + ".json"; } else if (js.section.equals("other")) { // If you see other-___.json, it means items that were missing from // JSON_config_*.txt @@ -775,11 +780,11 @@ private int convertCldrItems( if (type == RunType.main) { avl.full.add(filenameAsLangTag); } - } else if (type == RunType.rbnf) { - js.packageName = "rbnf"; - tier = ""; - } else if (type == RunType.bcp47) { - js.packageName = "bcp47"; + } else if (type == RunType.rbnf + || type == RunType.bcp47 + || type == RunType.transforms) { + // untiered, just use the name + js.packageName = type.name(); tier = ""; } if (js.packageName != null) { @@ -884,6 +889,28 @@ private int convertCldrItems( } } + if 
(item.getUntransformedPath() + .startsWith("//supplementalData/transforms")) { + // here, write the raw data + final String rawTransformFile = filename + TRANSFORM_RAW_SUFFIX; + try (PrintWriter outf = + FileUtilities.openUTF8Writer(outputDir, rawTransformFile)) { + outf.println(item.getValue()); + // note: not logging the write here- it will be logged when the + // .json file is written. + } + // the value is now the raw filename + item.setValue(rawTransformFile); + item.setPath( + item.getPath() + .replaceAll("\\]/tRule.*$", "]/_rulesFile") + .replace("/transforms/", "/")); + item.setFullPath( + item.getFullPath() + .replaceAll("\\]/tRule.*$", "]/_rulesFile") + .replace("/transforms/", "/")); + } + // some items need to be split to multiple item before processing. None // of those items need to be sorted. // Applies to SPLITTABLE_ATTRS attributes. @@ -1453,6 +1480,24 @@ public void writeDefaultContent(String outputDir) throws IOException { outf.close(); } + public void writeTransformMetadata(String outputDir) throws IOException { + final String dirName = outputDir + "/cldr-" + RunType.transforms.name(); + final String fileName = RunType.transforms.name() + ".json"; + PrintWriter outf = FileUtilities.openUTF8Writer(dirName, fileName); + System.out.println( + PACKAGE_ICON + + " Creating packaging file => " + + dirName + + File.separator + + fileName); + JsonObject obj = new JsonObject(); + obj.add( + RunType.transforms.name(), + gson.toJsonTree(CLDRTransforms.getInstance().getJsonIndex())); + outf.println(gson.toJson(obj)); + outf.close(); + } + public void writeCoverageLevels(String outputDir) throws IOException { try (PrintWriter outf = FileUtilities.openUTF8Writer(outputDir + "/cldr-core", "coverageLevels.json"); ) { @@ -2225,6 +2270,8 @@ public void processDirectory(String dirName, DraftStatus minimalDraftStatus) if (Boolean.parseBoolean(options.get("packagelist").getValue())) { writePackageList(outputDir); } + } else if (type == RunType.transforms) { + 
writeTransformMetadata(outputDir); } } } diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/json/LdmlConvertRules.java b/tools/cldr-code/src/main/java/org/unicode/cldr/json/LdmlConvertRules.java index 7e890aa5052..d15e233e861 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/json/LdmlConvertRules.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/json/LdmlConvertRules.java @@ -154,7 +154,14 @@ class LdmlConvertRules { "identity:variant:type", // in common/bcp47/*.xml - "keyword:key:name"); + "keyword:key:name", + + // transforms + + // transforms + "transforms:transform:source", + "transforms:transform:target", + "transforms:transform:direction"); /** * The set of element:attribute pair in which the attribute should be treated as value. All the diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/CLDRTransforms.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/CLDRTransforms.java index 2bcee0f7dd9..0a0cea7db31 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/CLDRTransforms.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/CLDRTransforms.java @@ -1128,4 +1128,19 @@ static String parseDoubleColon(String x, Set others) { } return ""; } + + public class CLDRTransformsJsonIndex { + /** raw list of available IDs */ + public String[] available = + getAvailableIds().stream() + .map((String id) -> id.replace(".xml", "")) + .collect(Collectors.toList()) + .toArray(new String[0]); + } + + /** This gets the metadata (index file) exposed as cldr-json/cldr-transforms/transforms.json */ + public CLDRTransformsJsonIndex getJsonIndex() { + final CLDRTransformsJsonIndex index = new CLDRTransformsJsonIndex(); + return index; + } } diff --git a/tools/cldr-code/src/main/resources/org/unicode/cldr/json/JSON_config_transforms.txt b/tools/cldr-code/src/main/resources/org/unicode/cldr/json/JSON_config_transforms.txt new file mode 100644 index 00000000000..9734f36fe6a --- /dev/null +++ 
b/tools/cldr-code/src/main/resources/org/unicode/cldr/json/JSON_config_transforms.txt @@ -0,0 +1,2 @@ +section=transforms ; path=//cldr/supplemental/transforms/.* ; package=transforms ; packageDesc=Transform data +dependency=core ; package=transforms From 4750f88e5857ff6e7a4713af38b547a90a510632 Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Tue, 10 Sep 2024 16:43:12 -0500 Subject: [PATCH 2/8] CLDR-16720 json: Update the release note --- docs/site/downloads/cldr-46.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/site/downloads/cldr-46.md b/docs/site/downloads/cldr-46.md index ab180e8a3dd..415bf5fe5fb 100644 --- a/docs/site/downloads/cldr-46.md +++ b/docs/site/downloads/cldr-46.md @@ -96,7 +96,7 @@ CLDR includes [data for sorting Han (CJK) characters in radical-stroke order](tr ### JSON Data Changes -**TBD** +- Transliteration (transform) data is now available in the `cldr-transforms` package. The JSON file contains transform metadata, and the `_rulesFile` key indicates an external (`.txt`) file containing the actual rules. [CLDR-16720][]. ### File Changes The following files were added: @@ -126,3 +126,5 @@ Many people have made significant contributions to CLDR and LDML; see the [Ackno The Unicode [Terms of Use](https://unicode.org/copyright.html) apply to CLDR data; in particular, see [Exhibit 1](https://unicode.org/copyright.html#Exhibit1). For web pages with different views of CLDR data, see [http://cldr.unicode.org/index/charts](https://cldr.unicode.org/index/charts). + +[CLDR-16720]: https://unicode-org.atlassian.net/issues/CLDR-16720 From 6ea2b87348b54b0ecd7609618996f8487441b6c8 Mon Sep 17 00:00:00 2001 From: "Steven R. 
Loomis" Date: Tue, 17 Sep 2024 17:32:12 -0500 Subject: [PATCH 3/8] CLDR-16720 json transliterator update - properly use BCP47 for source/target - fix corruption in alias and slashes in output --- .../java/org/unicode/cldr/json/Ldml2JsonConverter.java | 4 +++- .../resources/org/unicode/cldr/json/pathTransforms.txt | 8 ++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/json/Ldml2JsonConverter.java b/tools/cldr-code/src/main/java/org/unicode/cldr/json/Ldml2JsonConverter.java index f8d5432b5b0..e5673cf6b12 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/json/Ldml2JsonConverter.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/json/Ldml2JsonConverter.java @@ -514,7 +514,9 @@ private String transformPath(final String pathStr, final String pathPrefix) { if (!oldResult.equals(result)) { logger.fine(oldResult + " => " + result); } - } + } else if (result.startsWith("//cldr/transforms/transforms")) { + result = fixXpathBcp47(result, "transform", "source", "target"); + } } else if (result.contains("languages") || result.contains("languageAlias") || result.contains("languageMatches") diff --git a/tools/cldr-code/src/main/resources/org/unicode/cldr/json/pathTransforms.txt b/tools/cldr-code/src/main/resources/org/unicode/cldr/json/pathTransforms.txt index 8457c44e3ac..da4557e7f8e 100644 --- a/tools/cldr-code/src/main/resources/org/unicode/cldr/json/pathTransforms.txt +++ b/tools/cldr-code/src/main/resources/org/unicode/cldr/json/pathTransforms.txt @@ -130,10 +130,6 @@ < (.*(GMT|UTC).*/exemplarCity)(.*) > -# -< (.*/transforms/transform[^/]*)/(.*) -> $1/tRules/$2 - # < (.*)\[@territories="([^"]*)"\](.*)\[@alt="variant"\](.*) > $1\[@territories="$2-alt-variant"\] @@ -173,3 +169,7 @@ # ParentLocales < (.*/parentLocales)\[@component="([^"]*)"\]/(parentLocale)(.*)$ > $1/$2$4 + +# Transform - drop terminal tRule element +< (.*)/tRule.* +> $1 From f93472dee8f7969b011c20cb1537bde1c7a9f88b Mon Sep 17 
00:00:00 2001 From: "Steven R. Loomis" Date: Tue, 17 Sep 2024 17:35:21 -0500 Subject: [PATCH 4/8] CLDR-16720 spotless --- .../src/main/java/org/unicode/cldr/json/Ldml2JsonConverter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/json/Ldml2JsonConverter.java b/tools/cldr-code/src/main/java/org/unicode/cldr/json/Ldml2JsonConverter.java index e5673cf6b12..2ad95794fec 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/json/Ldml2JsonConverter.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/json/Ldml2JsonConverter.java @@ -516,7 +516,7 @@ private String transformPath(final String pathStr, final String pathPrefix) { } } else if (result.startsWith("//cldr/transforms/transforms")) { result = fixXpathBcp47(result, "transform", "source", "target"); - } + } } else if (result.contains("languages") || result.contains("languageAlias") || result.contains("languageMatches") From 5fc8be56e48a37e176c43315edeaf48884cd6d34 Mon Sep 17 00:00:00 2001 From: "Steven R. 
Loomis" Date: Tue, 17 Sep 2024 17:39:52 -0500 Subject: [PATCH 5/8] CLDR-16720 json transliterator update - back out bcp47 - broke some source/target ids --- .../src/main/java/org/unicode/cldr/json/Ldml2JsonConverter.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/json/Ldml2JsonConverter.java b/tools/cldr-code/src/main/java/org/unicode/cldr/json/Ldml2JsonConverter.java index 2ad95794fec..f8d5432b5b0 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/json/Ldml2JsonConverter.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/json/Ldml2JsonConverter.java @@ -514,8 +514,6 @@ private String transformPath(final String pathStr, final String pathPrefix) { if (!oldResult.equals(result)) { logger.fine(oldResult + " => " + result); } - } else if (result.startsWith("//cldr/transforms/transforms")) { - result = fixXpathBcp47(result, "transform", "source", "target"); } } else if (result.contains("languages") || result.contains("languageAlias") From 24598406733fb76901aa98d973d0071692f491c4 Mon Sep 17 00:00:00 2001 From: "Steven R. 
Loomis" Date: Tue, 17 Sep 2024 17:58:26 -0500 Subject: [PATCH 6/8] CLDR-16720 json transliterator- sort the transforms.json file --- .../src/main/java/org/unicode/cldr/util/CLDRTransforms.java | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/CLDRTransforms.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/CLDRTransforms.java index 0a0cea7db31..7f41cf3e577 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/CLDRTransforms.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/CLDRTransforms.java @@ -1134,6 +1134,7 @@ public class CLDRTransformsJsonIndex { public String[] available = getAvailableIds().stream() .map((String id) -> id.replace(".xml", "")) + .sorted() .collect(Collectors.toList()) .toArray(new String[0]); } From 054ab287aef767297d7112f1dacc75486e9060d0 Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Thu, 19 Sep 2024 13:52:07 -0700 Subject: [PATCH 7/8] CLDR-16720 json transliterator- improve format - hoist json content up 2 levels - fix 'BGN' in path --- .../unicode/cldr/json/Ldml2JsonConverter.java | 34 ++++++++++++++----- .../org/unicode/cldr/json/pathTransforms.txt | 4 +-- 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/json/Ldml2JsonConverter.java b/tools/cldr-code/src/main/java/org/unicode/cldr/json/Ldml2JsonConverter.java index f8d5432b5b0..1093df5d03a 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/json/Ldml2JsonConverter.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/json/Ldml2JsonConverter.java @@ -901,14 +901,6 @@ private int convertCldrItems( } // the value is now the raw filename item.setValue(rawTransformFile); - item.setPath( - item.getPath() - .replaceAll("\\]/tRule.*$", "]/_rulesFile") - .replace("/transforms/", "/")); - item.setFullPath( - item.getFullPath() - .replaceAll("\\]/tRule.*$", "]/_rulesFile") - .replace("/transforms/", "/")); } // some items need to be 
split to multiple item before processing. None @@ -970,7 +962,31 @@ private int convertCldrItems( outputUnitPreferenceData(js, theItems, out, nodesForLastItem); } - // closeNodes(out, nodesForLastItem.size() - 2, 0); + // Special processing for transforms. + if (type == RunType.transforms) { + final JsonObject jo = out.getAsJsonObject("transforms"); + if (jo == null || jo.isEmpty()) { + throw new RuntimeException( + "Could not get transforms object in " + filename); + } + @SuppressWarnings("unchecked") + final Entry[] s = jo.entrySet().toArray(new Entry[0]); + if (s == null || s.length != 1) { + throw new RuntimeException( + "Could not get 1 subelement of transforms in " + filename); + } + // key doesn't matter. + // move subitem up + out = s[0].getValue().getAsJsonObject(); + final Entry[] s2 = + out.entrySet().toArray(new Entry[0]); + if (s2 == null || s2.length != 1) { + throw new RuntimeException( + "Could not get 1 sub-subelement of transforms in " + filename); + } + // move sub-subitem up. + out = s2[0].getValue().getAsJsonObject(); + } // write JSON try (PrintWriter outf = FileUtilities.openUTF8Writer(outputDir, outFilename)) { diff --git a/tools/cldr-code/src/main/resources/org/unicode/cldr/json/pathTransforms.txt b/tools/cldr-code/src/main/resources/org/unicode/cldr/json/pathTransforms.txt index da4557e7f8e..6f97b92ce3f 100644 --- a/tools/cldr-code/src/main/resources/org/unicode/cldr/json/pathTransforms.txt +++ b/tools/cldr-code/src/main/resources/org/unicode/cldr/json/pathTransforms.txt @@ -171,5 +171,5 @@ > $1/$2$4 # Transform - drop terminal tRule element -< (.*)/tRule.* -> $1 +< //supplementalData/transforms/transform(.*)/tRule.*$ +> //supplementalData/transforms/transform$1/_rulesFile From fcbacb9c99adb5c513d4d292eb41e89eca75681a Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Fri, 20 Sep 2024 21:50:08 -0700 Subject: [PATCH 8/8] CLDR-16720 json transliterator- split out bcp47 aliases - split bcp47 and non-bcp47 aliases. 
--- .../unicode/cldr/json/Ldml2JsonConverter.java | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/json/Ldml2JsonConverter.java b/tools/cldr-code/src/main/java/org/unicode/cldr/json/Ldml2JsonConverter.java index 1093df5d03a..50924b0cfb2 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/json/Ldml2JsonConverter.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/json/Ldml2JsonConverter.java @@ -23,6 +23,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.HashSet; import java.util.Iterator; import java.util.LinkedList; import java.util.List; @@ -899,6 +900,10 @@ private int convertCldrItems( // note: not logging the write here- it will be logged when the // .json file is written. } + final String path = item.getPath(); + item.setPath(fixTransformPath(path)); + final String fullPath = item.getFullPath(); + item.setFullPath(fixTransformPath(fullPath)); // the value is now the raw filename item.setValue(rawTransformFile); } @@ -1033,6 +1038,51 @@ private int convertCldrItems( return totalItemsInFile; } + /** + * Fixup an XPathParts with a specific transform element + * + * @param xpp the XPathParts to modify + * @param attribute the attribute name, such as "alias" + */ + private static final void fixTransformPath(final XPathParts xpp, final String attribute) { + final String v = xpp.getAttributeValue(-2, attribute); // on penultimate element + if (v == null) return; + final Set aliases = new HashSet<>(); + final Set bcpAliases = new HashSet<>(); + for (final String s : v.split(" ")) { + final String q = Locale.forLanguageTag(s).toLanguageTag(); + if (s.equals(q)) { + // bcp47 round trips- add to bcp list + bcpAliases.add(s); + } else { + // different - add to other aliases. 
+ aliases.add(s); + } + } + if (aliases.isEmpty()) { + xpp.removeAttribute(-2, attribute); + } else { + xpp.setAttribute(-2, attribute, String.join(" ", aliases.toArray(new String[0]))); + } + if (bcpAliases.isEmpty()) { + xpp.removeAttribute(-2, attribute + "Bcp47"); + } else { + xpp.setAttribute( + -2, attribute + "Bcp47", String.join(" ", bcpAliases.toArray(new String[0]))); + } + } + + /** + * Fixup a transform path, expanding the alias and backwardAlias into bcp47 and non-bcp47 + * attributes. + */ + private static final String fixTransformPath(final String path) { + final XPathParts xpp = XPathParts.getFrozenInstance(path).cloneAsThawed(); + fixTransformPath(xpp, "alias"); + fixTransformPath(xpp, "backwardAlias"); + return xpp.toString(); + } + private static String valueSectionsFormat(int values, int sections) { return MessageFormat.format( "({0, plural, one {# value} other {# values}} in {1, plural, one {# section} other {# sections}})",