Skip to content

Commit

Permalink
Generate old monkeys (#979)
Browse files Browse the repository at this point in the history
* An extra file

* divergent syntaxes

* spots
  • Loading branch information
eggrobin authored Dec 4, 2024
1 parent f930b35 commit 8459b0b
Show file tree
Hide file tree
Showing 2 changed files with 122 additions and 59 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ public abstract class GenerateBreakTest implements UCD_Types {
Normalizer nfd;
Normalizer nfkd;

Segmenter segmenter;
UnicodeMap<String> partition;
UnicodeProperty prop;

Expand Down Expand Up @@ -322,6 +323,7 @@ public void run() throws IOException {

boolean forCLDR = seg.target == Segmenter.Target.FOR_CLDR;
String path = "UCD/" + ucd.getVersion() + '/' + (forCLDR ? "cldr/" : "auxiliary/");
String extraPath = "UCD/" + ucd.getVersion() + "/extra/";
String outFilename = fileName + "BreakTest";
if (forCLDR) {
outFilename = outFilename + "-cldr";
Expand Down Expand Up @@ -477,6 +479,37 @@ value, new ParsePosition(0), IUP.getXSymbolTable()))) {
fc.close();

generateTest(false, path, outFilename, propertyName);
generateCppOldMonkeys(extraPath, outFilename);
}

/**
 * Writes {@code <outFilename>.cpp} under {@code path}: a snippet of generated C++ (partition
 * definitions followed by segmentation rules) preceded by instructions telling the reader to
 * paste it into the matching monkey-test constructor in ICU4C's {@code rbbitst.cpp}.
 *
 * @param path directory (relative to the generation root) in which the file is created
 * @param outFilename base name of the output file; its first four characters are used to derive
 *     the name of the ICU4C monkey class mentioned in the instructions
 * @throws IOException if the output file cannot be opened or written
 */
private void generateCppOldMonkeys(String path, String outFilename) throws IOException {
    final UnicodeDataFile fc = UnicodeDataFile.openAndWriteHeader(path, outFilename + ".cpp");
    final PrintWriter out = fc.out;
    // The placeholder "Meow" is four characters wide so that substituting a four-letter
    // prefix keeps the comment box aligned. For grapheme clusters the prefix is "Grap"
    // (substring(0, 4) can never yield the five-letter "Graph"), which is mapped to "Char".
    final String monkeyKind = outFilename.substring(0, 4).replace("Grap", "Char");
    out.println();
    out.println("####### Instructions ##################################");
    out.println("# Copy the following lines into rbbitst.cpp in ICU4C, #");
    out.println(
            "# in the constructor of RBBIMeowMonkey, replacing the #"
                    .replace("Meow", monkeyKind));
    out.println("# existing block of generated code. #");
    out.println("#######################################################");
    out.println();
    out.println(" // --- NOLI ME TANGERE ---");
    out.println(" // Generated by GenerateBreakTest.java in the Unicode tools.");
    // One emplace_back per part of the partition, in the order the segmenter reports them.
    for (Segmenter.Builder.NamedRefinedSet part : segmenter.getPartitionDefinition()) {
        out.println(
                " partition.emplace_back(\""
                        + part.getName()
                        + "\", UnicodeSet(uR\"("
                        + part.getDefinition()
                        + ")\", status));");
    }
    out.println();
    // One push_back per rule, in rule order.
    for (Segmenter.SegmentationRule rule : segmenter.getRules()) {
        out.println(" rules.push_back(" + rule.toCppOldMonkeyString() + ");");
    }
    out.println(" // --- End of generated code. ---");
    fc.close();
}

private void generateTest(
Expand Down Expand Up @@ -1091,6 +1124,7 @@ public XGenerateBreakTest(
}
variables = segBuilder.getVariables();
collectingRules = false;
segmenter = seg;
partition = seg.getSamples();
fileName = filename;
propertyName = (filename.equals("Grapheme") ? "Grapheme_Cluster" : fileName) + "_Break";
Expand Down
147 changes: 88 additions & 59 deletions unicodetools/src/main/java/org/unicode/tools/Segmenter.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSet.SpanCondition;
import com.ibm.icu.text.UnicodeSet.XSymbolTable;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.icu.util.ULocale;
import java.text.ParsePosition;
Expand All @@ -36,7 +35,9 @@
import java.util.stream.Collectors;
import org.unicode.cldr.draft.FileUtilities;
import org.unicode.cldr.util.TransliteratorUtilities;
import org.unicode.props.IndexUnicodeProperties;
import org.unicode.props.UnicodeProperty;
import org.unicode.tools.Segmenter.Builder.NamedRefinedSet;
import org.unicode.tools.Segmenter.SegmentationRule.Breaks;

/** Ordered list of rules, with variables resolved before building. Use Builder to make. */
Expand Down Expand Up @@ -68,6 +69,7 @@ public enum Target {
public final Target target;

private UnicodeMap<String> samples = new UnicodeMap<String>();
private List<NamedRefinedSet> partitionDefinition = new ArrayList<>();

private Segmenter(Target target) {
this.target = target;
Expand Down Expand Up @@ -279,13 +281,16 @@ public abstract Breaks applyAt(
public String toString() {
// Default rendering: delegate to the toString(boolean) overload with the flag off
// (the subclass overrides name that flag showResolved).
return toString(false);
}

/**
 * Returns the text of a C++ expression that constructs this rule. The generator in
 * GenerateBreakTest wraps the result in {@code rules.push_back(...)} when emitting code to be
 * pasted into ICU4C's rbbitst.cpp.
 */
public abstract String toCppOldMonkeyString();
}

/** A « treat as » rule. */
public static class RemapRule extends SegmentationRule {

/**
 * Creates a remap rule.
 *
 * @param leftHandSide the pattern to match, possibly still containing UnicodeSet syntax; it is
 *     kept as-is for {@link #toCppOldMonkeyString()} and expanded to literals before being
 *     compiled as a regular expression
 * @param replacement the replacement text applied to matches
 * @param line the source line of the rule, used as its name
 */
public RemapRule(String leftHandSide, String replacement, String line) {
    patternDefinition = leftHandSide;
    // Compile only the expanded form: the raw left-hand side may contain UnicodeSet syntax
    // that java.util.regex does not understand, and compiling it first was redundant anyway
    // because the result was immediately overwritten.
    pattern = Pattern.compile(Builder.expandUnicodeSets(leftHandSide), REGEX_FLAGS);
    this.replacement = replacement;
    name = line;
}
Expand Down Expand Up @@ -352,6 +357,7 @@ public void apply(
remap.accept(result);
}

private String patternDefinition;
private Pattern pattern;
private String replacement;
private String name;
Expand All @@ -373,6 +379,17 @@ public Breaks applyAt(
protected String toString(boolean showResolved) {
return name;
}

@Override
public String toCppOldMonkeyString() {
    // Double every '&' and '-' in the pattern definition before embedding it in the
    // C++ raw string literal.
    final String escapedPattern = patternDefinition.replace("&", "&&").replace("-", "--");
    final StringBuilder cpp = new StringBuilder("std::make_unique<RemapRule>(uR\"(");
    cpp.append(name);
    cpp.append(")\", uR\"(");
    cpp.append(escapedPattern);
    cpp.append(")\", uR\"(");
    cpp.append(replacement);
    cpp.append(")\")");
    return cpp.toString();
}
}

/** A rule that determines the status of an offset. */
Expand All @@ -384,6 +401,10 @@ public static class RegexRule extends SegmentationRule {
* @param line
*/
public RegexRule(String before, Breaks result, String after, String line) {
beforeDefinition = before;
afterDefinition = after;
before = Builder.expandUnicodeSets(before);
after = Builder.expandUnicodeSets(after);
breaks = result;
before = ".*(" + before + ")";
String parsing = null;
Expand Down Expand Up @@ -453,12 +474,27 @@ public String toString(boolean showResolved) {
return result;
}

@Override
public String toCppOldMonkeyString() {
    // Double every '&' and '-' in both context definitions before embedding them in
    // C++ raw string literals.
    final String escapedBefore = beforeDefinition.replace("&", "&&").replace("-", "--");
    final char resolution;
    if (breaks == Breaks.BREAK) {
        resolution = '÷';
    } else {
        resolution = '×';
    }
    final String escapedAfter = afterDefinition.replace("&", "&&").replace("-", "--");
    final StringBuilder cpp = new StringBuilder("std::make_unique<RegexRule>(uR\"(");
    cpp.append(name);
    cpp.append(")\", uR\"(");
    cpp.append(escapedBefore);
    cpp.append(")\", u'");
    cpp.append(resolution);
    cpp.append("', uR\"(");
    cpp.append(escapedAfter);
    cpp.append(")\")");
    return cpp.toString();
}

// ============== Internals ================
// We cannot use a single regex of the form "(?<= before) after" because
// (RI RI)* RI × RI would require unbounded lookbehind.
private Pattern before;
private Pattern after;
private String name;
private String beforeDefinition;
private String afterDefinition;

private String resolved;
private Breaks breaks;
Expand All @@ -474,31 +510,36 @@ public String toString(boolean showResolved) {
public static class Builder {
private final UnicodeProperty.Factory propFactory;
private final Target target;
private XSymbolTable symbolTable;
private List<String> rawVariables = new ArrayList<String>();
private Map<Double, String> xmlRules = new TreeMap<Double, String>();
private Map<Double, String> htmlRules = new TreeMap<Double, String>();
private List<String> lastComments = new ArrayList<String>();

class NamedSet {
NamedSet(String name, UnicodeSet set) {
NamedSet(String name, String definition, UnicodeSet set) {
this.name = name;
this.definition = definition;
this.set = set;
}

String name;
String definition;
UnicodeSet set;
}

class NamedRefinedSet {
public class NamedRefinedSet {
public NamedRefinedSet clone() {
NamedRefinedSet result = new NamedRefinedSet();
for (var term : intersectionTerms) {
result.intersectionTerms.add(new NamedSet(term.name, term.set.cloneAsThawed()));
result.intersectionTerms.add(
new NamedSet(term.name, term.definition, term.set.cloneAsThawed()));
}
for (var subtrahend : subtrahends) {
result.subtrahends.add(
new NamedSet(subtrahend.name, subtrahend.set.cloneAsThawed()));
new NamedSet(
subtrahend.name,
subtrahend.definition,
subtrahend.set.cloneAsThawed()));
}
result.set = this.set.cloneAsThawed();
return result;
Expand Down Expand Up @@ -547,6 +588,19 @@ public String getName() {
.collect(Collectors.joining());
}

/**
 * Builds a bracketed set expression for this part from the definitions of its terms: the
 * intersection terms joined by {@code &}, followed by each subtrahend prefixed with {@code -}.
 * When there are no intersection terms the fixed pattern {@code [^[]]} is returned instead.
 */
public String getDefinition() {
    if (intersectionTerms.isEmpty()) {
        return "[^[]]";
    }
    final StringBuilder pattern = new StringBuilder("[");
    String separator = "";
    for (var term : intersectionTerms) {
        pattern.append(separator).append(term.definition);
        separator = "&";
    }
    for (var subtrahend : subtrahends) {
        pattern.append("-").append(subtrahend.definition);
    }
    pattern.append("]");
    return pattern.toString();
}

private UnicodeSet getIntersection() {
UnicodeSet result = UnicodeSet.ALL_CODE_POINTS.cloneAsThawed();
for (var term : intersectionTerms) {
Expand All @@ -565,54 +619,11 @@ private UnicodeSet getIntersection() {
/**
 * Creates a builder and seeds the HTML rule table with the three fixed entries (start of
 * text, end of text, and break-anywhere).
 *
 * @param factory source of the Unicode properties used by this builder
 * @param target the target the resulting segmenter is built for
 */
public Builder(UnicodeProperty.Factory factory, Target target) {
    propFactory = factory;
    this.target = target;
    symbolTable = new MyXSymbolTable(); // propFactory.getXSymbolTable();
    // Double.valueOf replaces the deprecated Double(double) boxing constructor; the
    // resulting map keys compare equal to the ones produced before.
    htmlRules.put(Double.valueOf(BREAK_SOT), "sot \u00F7");
    htmlRules.put(Double.valueOf(BREAK_EOT), "\u00F7 eot");
    htmlRules.put(Double.valueOf(BREAK_ANY), "\u00F7 Any");
}

// copied to make independent of ICU4J internals
/**
 * Symbol table that resolves property queries through this builder's propFactory and fails
 * loudly on unknown property names.
 */
private class MyXSymbolTable extends UnicodeSet.XSymbolTable {
/**
 * Resolves a property query into {@code result}.
 *
 * @param propertyName the property name; if it is not a known property and propertyValue is
 *     empty, it is tried as a value of the Script property
 * @param propertyValue the property value; an empty value on a binary property is treated as
 *     "Yes"
 * @param result the set that is cleared and then passed to the property's getSet
 * @return true whenever the property was recognized, even if the resulting set is empty
 * @throws IllegalArgumentException if the property name cannot be handled
 */
public boolean applyPropertyAlias(
String propertyName, String propertyValue, UnicodeSet result) {
UnicodeProperty prop = propFactory.getProperty(propertyName);
if (prop == null) {
// A bare name with no value: try interpreting it as a Script value.
if (propertyValue.isEmpty()) {
prop = propFactory.getProperty("Script");
result.clear();
UnicodeSet x = prop.getSet(propertyName, result);
if (!x.isEmpty()) {
return true;
}
}
// If we cannot handle the property name, then we need to really fail.
// If we were to just print something and return false, then the UnicodeSet code
// would just evaluate this itself, and may succeed but give wrong results.
// For example, as long as we require "gc=Cn" and don't handle "Cn" here,
// falling back to built-in ICU data means that we get gc=Cn ranges from ICU
// rather than from the current Unicode beta.
throw new IllegalArgumentException(
"Segmenter.MyXSymbolTable: Unknown property " + propertyName);
}
// Binary properties:
// \p{Extended_Pictographic} is equivalent with \p{Extended_Pictographic=Yes}
if (propertyValue.isEmpty() && prop.isType(UnicodeProperty.BINARY_MASK)) {
propertyValue = "Yes";
}
result.clear();
UnicodeSet x = prop.getSet(propertyValue, result);
if (x.isEmpty()) {
// didn't find anything
// Only a diagnostic on stdout; an empty result is not treated as an error.
System.out.println(
"Segmenter.MyXSymbolTable: !Empty! "
+ propertyName
+ "="
+ propertyValue);
}
return true; // mark that we handled it even if there are no results.
}
}

public String toString(String testName, String indent) {

StringBuffer result = new StringBuffer();
Expand Down Expand Up @@ -728,10 +739,15 @@ Builder addVariable(String name, String value) {
+ TransliteratorUtilities.toXML.transliterate(value)
+ "</variable>");
value = replaceVariables(value, variables);
;
if (!name.endsWith("_")) {
try {
parsePosition.setIndex(0);
UnicodeSet valueSet = new UnicodeSet(value, parsePosition, symbolTable);
UnicodeSet valueSet =
new UnicodeSet(
value,
parsePosition,
IndexUnicodeProperties.make().getXSymbolTable());
if (parsePosition.getIndex() != value.length()) {
if (SHOW_SAMPLES)
System.out.println(
Expand All @@ -748,7 +764,7 @@ Builder addVariable(String name, String value) {
} else {
String name2 = name;
if (name2.startsWith("$")) name2 = name2.substring(1);
refinePartition(new NamedSet(name2, valueSet));
refinePartition(new NamedSet(name2, value, valueSet));
if (SHOW_SAMPLES) {
System.out.println("Samples for: " + name + " = " + value);
System.out.println("\t" + valueSet);
Expand Down Expand Up @@ -827,8 +843,7 @@ Builder addRemapRule(Double order, String before, String after, String line) {
+ " </rule>");
rules.put(
order,
new Segmenter.RemapRule(
replaceVariables(before, expandedVariables), after, line));
new Segmenter.RemapRule(replaceVariables(before, variables), after, line));
return this;
}

Expand Down Expand Up @@ -889,9 +904,9 @@ Builder addRegexRule(
rules.put(
order,
new Segmenter.RegexRule(
replaceVariables(before, expandedVariables),
replaceVariables(before, variables),
breaks,
replaceVariables(after, expandedVariables),
replaceVariables(after, variables),
line));
return this;
}
Expand All @@ -906,6 +921,7 @@ public Segmenter make() {
for (Double key : rules.keySet()) {
result.add(key.doubleValue(), rules.get(key));
}
result.partitionDefinition = partition;
for (var part : partition) {
if (part.getName() == null) {
throw new IllegalArgumentException("Unclassified characters: " + part.getSet());
Expand Down Expand Up @@ -952,14 +968,19 @@ private static String replaceVariables(String input, Map<String, String> variabl
}

/** Replaces Unicode Sets with literals. */
public String expandUnicodeSets(String input) {
public static String expandUnicodeSets(String input) {
String result = input;
var parsePosition = new ParsePosition(0);
// replace properties
// TODO really dumb parse for now, fix later
for (int i = 0; i < result.length(); ++i) {
if (UnicodeSet.resemblesPattern(result, i)) {
parsePosition.setIndex(i);
UnicodeSet temp = new UnicodeSet(result, parsePosition, symbolTable);
UnicodeSet temp =
new UnicodeSet(
result,
parsePosition,
IndexUnicodeProperties.make().getXSymbolTable());
String insert = getInsertablePattern(temp);
result =
result.substring(0, i)
Expand All @@ -981,7 +1002,7 @@ public String expandUnicodeSets(String input) {
* @param temp
* @return
*/
private String getInsertablePattern(UnicodeSet temp) {
private static String getInsertablePattern(UnicodeSet temp) {
temp.complement().complement();
if (DEBUG_REDUCE_SET_SIZE != null) {
UnicodeSet temp2 = new UnicodeSet(temp);
Expand Down Expand Up @@ -1053,6 +1074,14 @@ public List<String> getRules() {
}
}

/**
 * Returns the list of named parts making up this segmenter's partition. The internal list is
 * returned directly, not a copy.
 */
public List<NamedRefinedSet> getPartitionDefinition() {
return partitionDefinition;
}

/**
 * Returns this segmenter's rules. The internal list is returned directly, not a copy.
 */
public List<SegmentationRule> getRules() {
return rules;
}

// ============== Internals ================

private List<SegmentationRule> rules = new ArrayList<SegmentationRule>(1);
Expand Down

0 comments on commit 8459b0b

Please sign in to comment.