From 8459b0b27f09eeec411252d214cb7e2fd47fdd45 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 4 Dec 2024 18:39:03 +0100 Subject: [PATCH] Generate old monkeys (#979) * An extra file * divergent syntaxes * spots --- .../unicode/text/UCD/GenerateBreakTest.java | 34 ++++ .../java/org/unicode/tools/Segmenter.java | 147 +++++++++++------- 2 files changed, 122 insertions(+), 59 deletions(-) diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateBreakTest.java b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateBreakTest.java index 81b29c395..651ad8ded 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateBreakTest.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateBreakTest.java @@ -60,6 +60,7 @@ public abstract class GenerateBreakTest implements UCD_Types { Normalizer nfd; Normalizer nfkd; + Segmenter segmenter; UnicodeMap partition; UnicodeProperty prop; @@ -322,6 +323,7 @@ public void run() throws IOException { boolean forCLDR = seg.target == Segmenter.Target.FOR_CLDR; String path = "UCD/" + ucd.getVersion() + '/' + (forCLDR ? "cldr/" : "auxiliary/"); + String extraPath = "UCD/" + ucd.getVersion() + "/extra/"; String outFilename = fileName + "BreakTest"; if (forCLDR) { outFilename = outFilename + "-cldr"; @@ -477,6 +479,37 @@ value, new ParsePosition(0), IUP.getXSymbolTable()))) { fc.close(); generateTest(false, path, outFilename, propertyName); + generateCppOldMonkeys(extraPath, outFilename); + } + + private void generateCppOldMonkeys(String path, String outFilename) throws IOException { + final UnicodeDataFile fc = UnicodeDataFile.openAndWriteHeader(path, outFilename + ".cpp"); + final PrintWriter out = fc.out; + out.println(); + out.println("####### Instructions ##################################"); + out.println("# Copy the following lines into rbbitst.cpp in ICU4C, #"); + out.println( + "# in the constructor of RBBIMeowMonkey, replacing the #" + .replace("Meow", outFilename.substring(0, 4).replace("Graph", "Char"))); + out.println("# existing block of generated code. #"); + out.println("#######################################################"); + out.println(); + out.println(" // --- NOLI ME TANGERE ---"); + out.println(" // Generated by GenerateBreakTest.java in the Unicode tools."); + for (Segmenter.Builder.NamedRefinedSet part : segmenter.getPartitionDefinition()) { + out.println( + " partition.emplace_back(\"" + + part.getName() + + "\", UnicodeSet(uR\"(" + + part.getDefinition() + + ")\", status));"); + } + out.println(); + for (Segmenter.SegmentationRule rule : segmenter.getRules()) { + out.println(" rules.push_back(" + rule.toCppOldMonkeyString() + ");"); + } + out.println(" // --- End of generated code. ---"); + fc.close(); } private void generateTest( @@ -1091,6 +1124,7 @@ public XGenerateBreakTest( } variables = segBuilder.getVariables(); collectingRules = false; + segmenter = seg; partition = seg.getSamples(); fileName = filename; propertyName = (filename.equals("Grapheme") ? "Grapheme_Cluster" : fileName) + "_Break"; diff --git a/unicodetools/src/main/java/org/unicode/tools/Segmenter.java b/unicodetools/src/main/java/org/unicode/tools/Segmenter.java index ebf22c336..261787838 100644 --- a/unicodetools/src/main/java/org/unicode/tools/Segmenter.java +++ b/unicodetools/src/main/java/org/unicode/tools/Segmenter.java @@ -17,7 +17,6 @@ import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.text.UnicodeSet.SpanCondition; -import com.ibm.icu.text.UnicodeSet.XSymbolTable; import com.ibm.icu.text.UnicodeSetIterator; import com.ibm.icu.util.ULocale; import java.text.ParsePosition; @@ -36,7 +35,9 @@ import java.util.stream.Collectors; import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.util.TransliteratorUtilities; +import org.unicode.props.IndexUnicodeProperties; import org.unicode.props.UnicodeProperty; +import org.unicode.tools.Segmenter.Builder.NamedRefinedSet; import org.unicode.tools.Segmenter.SegmentationRule.Breaks; /** Ordered list of rules, with variables resolved before building. Use Builder to make. */ @@ -68,6 +69,7 @@ public enum Target { public final Target target; private UnicodeMap samples = new UnicodeMap(); + private List partitionDefinition = new ArrayList<>(); private Segmenter(Target target) { this.target = target; @@ -279,13 +281,16 @@ public abstract Breaks applyAt( public String toString() { return toString(false); } + + public abstract String toCppOldMonkeyString(); } /** A « treat as » rule. */ public static class RemapRule extends SegmentationRule { public RemapRule(String leftHandSide, String replacement, String line) { - pattern = Pattern.compile(leftHandSide, REGEX_FLAGS); + patternDefinition = leftHandSide; + pattern = Pattern.compile(Builder.expandUnicodeSets(leftHandSide), REGEX_FLAGS); this.replacement = replacement; name = line; } @@ -352,6 +357,7 @@ public void apply( remap.accept(result); } + private String patternDefinition; private Pattern pattern; private String replacement; private String name; @@ -373,6 +379,17 @@ public Breaks applyAt( protected String toString(boolean showResolved) { return name; } + + @Override + public String toCppOldMonkeyString() { + return "std::make_unique(uR\"(" + + name + + ")\", uR\"(" + + patternDefinition.replaceAll("&", "&&").replaceAll("-", "--") + + ")\", uR\"(" + + replacement + + ")\")"; + } } /** A rule that determines the status of an offset. */ @@ -384,6 +401,10 @@ public static class RegexRule extends SegmentationRule { * @param line */ public RegexRule(String before, Breaks result, String after, String line) { + beforeDefinition = before; + afterDefinition = after; + before = Builder.expandUnicodeSets(before); + after = Builder.expandUnicodeSets(after); breaks = result; before = ".*(" + before + ")"; String parsing = null; @@ -453,12 +474,27 @@ public String toString(boolean showResolved) { return result; } + @Override + public String toCppOldMonkeyString() { + return "std::make_unique(uR\"(" + + name + + ")\", uR\"(" + + beforeDefinition.replaceAll("&", "&&").replaceAll("-", "--") + + ")\", u'" + + (breaks == Breaks.BREAK ? '÷' : '×') + + "', uR\"(" + + afterDefinition.replaceAll("&", "&&").replaceAll("-", "--") + + ")\")"; + } + // ============== Internals ================ // We cannot use a single regex of the form "(?<= before) after" because // (RI RI)* RI × RI would require unbounded lookbehind. private Pattern before; private Pattern after; private String name; + private String beforeDefinition; + private String afterDefinition; private String resolved; private Breaks breaks; @@ -474,31 +510,36 @@ public String toString(boolean showResolved) { public static class Builder { private final UnicodeProperty.Factory propFactory; private final Target target; - private XSymbolTable symbolTable; private List rawVariables = new ArrayList(); private Map xmlRules = new TreeMap(); private Map htmlRules = new TreeMap(); private List lastComments = new ArrayList(); class NamedSet { - NamedSet(String name, UnicodeSet set) { + NamedSet(String name, String definition, UnicodeSet set) { this.name = name; + this.definition = definition; this.set = set; } String name; + String definition; UnicodeSet set; } - class NamedRefinedSet { + public class NamedRefinedSet { public NamedRefinedSet clone() { NamedRefinedSet result = new NamedRefinedSet(); for (var term : intersectionTerms) { - result.intersectionTerms.add(new NamedSet(term.name, term.set.cloneAsThawed())); + result.intersectionTerms.add( + new NamedSet(term.name, term.definition, term.set.cloneAsThawed())); } for (var subtrahend : subtrahends) { result.subtrahends.add( - new NamedSet(subtrahend.name, subtrahend.set.cloneAsThawed())); + new NamedSet( + subtrahend.name, + subtrahend.definition, + subtrahend.set.cloneAsThawed())); } result.set = this.set.cloneAsThawed(); return result; @@ -547,6 +588,19 @@ public String getName() { .collect(Collectors.joining()); } + public String getDefinition() { + return intersectionTerms.isEmpty() + ? "[^[]]" + : "[" + + intersectionTerms.stream() + .map((s) -> s.definition) + .collect(Collectors.joining("&")) + + subtrahends.stream() + .map((s) -> "-" + s.definition) + .collect(Collectors.joining()) + + "]"; + } + private UnicodeSet getIntersection() { UnicodeSet result = UnicodeSet.ALL_CODE_POINTS.cloneAsThawed(); for (var term : intersectionTerms) { @@ -565,54 +619,11 @@ private UnicodeSet getIntersection() { public Builder(UnicodeProperty.Factory factory, Target target) { propFactory = factory; this.target = target; - symbolTable = new MyXSymbolTable(); // propFactory.getXSymbolTable(); htmlRules.put(new Double(BREAK_SOT), "sot \u00F7"); htmlRules.put(new Double(BREAK_EOT), "\u00F7 eot"); htmlRules.put(new Double(BREAK_ANY), "\u00F7 Any"); } - // copied to make independent of ICU4J internals - private class MyXSymbolTable extends UnicodeSet.XSymbolTable { - public boolean applyPropertyAlias( - String propertyName, String propertyValue, UnicodeSet result) { - UnicodeProperty prop = propFactory.getProperty(propertyName); - if (prop == null) { - if (propertyValue.isEmpty()) { - prop = propFactory.getProperty("Script"); - result.clear(); - UnicodeSet x = prop.getSet(propertyName, result); - if (!x.isEmpty()) { - return true; - } - } - // If we cannot handle the property name, then we need to really fail. - // If we were to just print something and return false, then the UnicodeSet code - // would just evaluate this itself, and may succeed but give wrong results. - // For example, as long as we require "gc=Cn" and don't handle "Cn" here, - // falling back to built-in ICU data means that we get gc=Cn ranges from ICU - // rather than from the current Unicode beta. - throw new IllegalArgumentException( - "Segmenter.MyXSymbolTable: Unknown property " + propertyName); - } - // Binary properties: - // \p{Extended_Pictographic} is equivalent with \p{Extended_Pictographic=Yes} - if (propertyValue.isEmpty() && prop.isType(UnicodeProperty.BINARY_MASK)) { - propertyValue = "Yes"; - } - result.clear(); - UnicodeSet x = prop.getSet(propertyValue, result); - if (x.isEmpty()) { - // didn't find anything - System.out.println( - "Segmenter.MyXSymbolTable: !Empty! " - + propertyName - + "=" - + propertyValue); - } - return true; // mark that we handled it even if there are no results. - } - } - public String toString(String testName, String indent) { StringBuffer result = new StringBuffer(); @@ -728,10 +739,15 @@ Builder addVariable(String name, String value) { + TransliteratorUtilities.toXML.transliterate(value) + ""); value = replaceVariables(value, variables); + ; if (!name.endsWith("_")) { try { parsePosition.setIndex(0); - UnicodeSet valueSet = new UnicodeSet(value, parsePosition, symbolTable); + UnicodeSet valueSet = + new UnicodeSet( + value, + parsePosition, + IndexUnicodeProperties.make().getXSymbolTable()); if (parsePosition.getIndex() != value.length()) { if (SHOW_SAMPLES) System.out.println( @@ -748,7 +764,7 @@ Builder addVariable(String name, String value) { } else { String name2 = name; if (name2.startsWith("$")) name2 = name2.substring(1); - refinePartition(new NamedSet(name2, valueSet)); + refinePartition(new NamedSet(name2, value, valueSet)); if (SHOW_SAMPLES) { System.out.println("Samples for: " + name + " = " + value); System.out.println("\t" + valueSet); @@ -827,8 +843,7 @@ Builder addRemapRule(Double order, String before, String after, String line) { + " "); rules.put( order, - new Segmenter.RemapRule( - replaceVariables(before, expandedVariables), after, line)); + new Segmenter.RemapRule(replaceVariables(before, variables), after, line)); return this; } @@ -889,9 +904,9 @@ Builder addRegexRule( rules.put( order, new Segmenter.RegexRule( - replaceVariables(before, expandedVariables), + replaceVariables(before, variables), breaks, - replaceVariables(after, expandedVariables), + replaceVariables(after, variables), line)); return this; } @@ -906,6 +921,7 @@ public Segmenter make() { for (Double key : rules.keySet()) { result.add(key.doubleValue(), rules.get(key)); } + result.partitionDefinition = partition; for (var part : partition) { if (part.getName() == null) { throw new IllegalArgumentException("Unclassified characters: " + part.getSet()); @@ -952,14 +968,19 @@ private static String replaceVariables(String input, Map variabl } /** Replaces Unicode Sets with literals. */ - public String expandUnicodeSets(String input) { + public static String expandUnicodeSets(String input) { String result = input; + var parsePosition = new ParsePosition(0); // replace properties // TODO really dumb parse for now, fix later for (int i = 0; i < result.length(); ++i) { if (UnicodeSet.resemblesPattern(result, i)) { parsePosition.setIndex(i); - UnicodeSet temp = new UnicodeSet(result, parsePosition, symbolTable); + UnicodeSet temp = + new UnicodeSet( + result, + parsePosition, + IndexUnicodeProperties.make().getXSymbolTable()); String insert = getInsertablePattern(temp); result = result.substring(0, i) @@ -981,7 +1002,7 @@ public String expandUnicodeSets(String input) { * @param temp * @return */ - private String getInsertablePattern(UnicodeSet temp) { + private static String getInsertablePattern(UnicodeSet temp) { temp.complement().complement(); if (DEBUG_REDUCE_SET_SIZE != null) { UnicodeSet temp2 = new UnicodeSet(temp); @@ -1053,6 +1074,14 @@ public List getRules() { } } + public List getPartitionDefinition() { + return partitionDefinition; + } + + public List getRules() { + return rules; + } + // ============== Internals ================ private List rules = new ArrayList(1);