Skip to content

Commit

Permalink
Merge branch 'remap' into remap-redux
Browse files Browse the repository at this point in the history
  • Loading branch information
eggrobin committed Oct 14, 2024
2 parents c81086e + b4229ef commit 911069e
Showing 1 changed file with 73 additions and 45 deletions.
118 changes: 73 additions & 45 deletions unicodetools/src/main/java/org/unicode/tools/Segmenter.java
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,9 @@
import java.util.TreeMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Consumer;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import javax.swing.text.Segment;

import org.unicode.cldr.draft.FileUtilities;
import org.unicode.cldr.util.RegexUtilities;
import org.unicode.cldr.util.TransliteratorUtilities;
Expand Down Expand Up @@ -177,10 +172,11 @@ public boolean breaksAt(CharSequence text, int position) {
return false;
}
StringBuilder remapped = new StringBuilder(text.toString());
Consumer<CharSequence> remap = (s) -> {
remapped.setLength(0);
remapped.append(s);}
;
Consumer<CharSequence> remap =
(s) -> {
remapped.setLength(0);
remapped.append(s);
};
Integer[] indexInRemapped = new Integer[text.length() + 1];
for (int i = 0; i < indexInRemapped.length; ++i) {
indexInRemapped[i] = i;
Expand Down Expand Up @@ -241,13 +237,12 @@ public String toString(boolean showResolved) {
String result = "";
for (int i = 0; i < rules.size(); ++i) {
if (i != 0) result += Utility.LINE_SEPARATOR;
result += orders.get(i) + ")\tTODO";// + rules.get(i).toString(showResolved);
result += orders.get(i) + ")\tTODO"; // + rules.get(i).toString(showResolved);
}
return result;
}


public static abstract class SegmentationRule {
public abstract static class SegmentationRule {
/** Status of a breaking rule */
public enum Breaks {
UNKNOWN_BREAK,
Expand All @@ -257,39 +252,53 @@ public enum Breaks {

/**
* Applies this rule throughout the text.
*
* @param remappedString The text, with any preceding remappings applied.
* @param indexInRemapped An array whose size is one greater than the original string.
* Associates indices in the original string to indices in remappedString.
* indexInRemapped[0] == 0, and indexInRemapped[indexInRemapped.size() - 1] == remappedString.size().
* Whenever indexInRemapped[i] == null,
* resolvedBreaks[i] == NO_BREAK: this corresponds to positions
* inside a string which has been replaced by a remap rule.
* Remap rules may update this mapping.
* @param resolvedBreaks An array whose size is one greater than the original string, indicating resolved breaks in the string.
* Values that are UNKNOWN_BREAK are updated if the rule applies to their position.
* @param remap Called by remap rules with the value of remappedString to be passed to subsequent rules.
* The indices in indexInRemapped are updated consistently.
* Associates indices in the original string to indices in remappedString.
* indexInRemapped[0] == 0, and indexInRemapped[indexInRemapped.size() - 1] ==
* remappedString.size(). Whenever indexInRemapped[i] == null, resolvedBreaks[i] ==
* NO_BREAK: this corresponds to positions inside a string which has been replaced by a
* remap rule. Remap rules may update this mapping.
* @param resolvedBreaks An array whose size is one greater than the original string,
* indicating resolved breaks in the string. Values that are UNKNOWN_BREAK are updated
* if the rule applies to their position.
* @param remap Called by remap rules with the value of remappedString to be passed to
* subsequent rules. The indices in indexInRemapped are updated consistently.
*/
public abstract void apply(CharSequence remappedString, Integer[] indexInRemapped, Breaks[] resolvedBreaks, Consumer<CharSequence> remap);
/** Same as above, but only returns the resolution at the current position. */
public abstract Breaks applyAt(int position, CharSequence remappedString, Integer[] indexInRemapped, Consumer<CharSequence> remap);
public abstract void apply(
CharSequence remappedString,
Integer[] indexInRemapped,
Breaks[] resolvedBreaks,
Consumer<CharSequence> remap);

/** Same as above, but only returns the resolution at the current position. */
public abstract Breaks applyAt(
int position,
CharSequence remappedString,
Integer[] indexInRemapped,
Consumer<CharSequence> remap);
}

/** A « treat as » rule. */
public static class RemapRule extends SegmentationRule {

public RemapRule(String leftHandSide, String replacement) {
pattern = Pattern.compile(leftHandSide, REGEX_FLAGS);
this.replacement = replacement;
}

public void apply(CharSequence remappedString, Integer[] indexInRemapped, Breaks[] resolvedBreaks, Consumer<CharSequence> remap) {

public void apply(
CharSequence remappedString,
Integer[] indexInRemapped,
Breaks[] resolvedBreaks,
Consumer<CharSequence> remap) {
final var result = new StringBuilder();
int i = 0;
int offset = 0;
final var matcher = pattern.matcher(remappedString);
while (matcher.find()) {
for (;; ++i) {
for (; ; ++i) {
if (indexInRemapped[i] == null) {
continue;
}
Expand All @@ -298,15 +307,19 @@ public void apply(CharSequence remappedString, Integer[] indexInRemapped, Breaks
}
indexInRemapped[i] += offset;
}
for (;; ++i) {
for (; ; ++i) {
if (indexInRemapped[i] == null) {
continue;
}
if (indexInRemapped[i] == matcher.end()) {
break;
}
if (resolvedBreaks[i] == Breaks.BREAK) {
throw new IllegalArgumentException("Replacement rule at remapped indices " + matcher.start() + " sqq. spans a break: " + remappedString);
throw new IllegalArgumentException(
"Replacement rule at remapped indices "
+ matcher.start()
+ " sqq. spans a break: "
+ remappedString);
}
resolvedBreaks[i] = Breaks.NO_BREAK;
indexInRemapped[i] = null;
Expand All @@ -323,23 +336,33 @@ public void apply(CharSequence remappedString, Integer[] indexInRemapped, Breaks
matcher.appendTail(result);
if (indexInRemapped[indexInRemapped.length - 1] != result.length()) {
StringBuilder meow = new StringBuilder();
for (var j : indexInRemapped) {
for (var j : indexInRemapped) {
meow.append(j == null ? "null" : j.toString());
meow.append(",");
}
throw new IllegalArgumentException("Inconsistent indexInRemapped " + meow + " for new remapped string " + result);
throw new IllegalArgumentException(
"Inconsistent indexInRemapped "
+ meow
+ " for new remapped string "
+ result);
}
remap.accept(result);
}

private Pattern pattern;
private String replacement;

@Override
public Breaks applyAt(int position, CharSequence remappedString, Integer[] indexInRemapped,
public Breaks applyAt(
int position,
CharSequence remappedString,
Integer[] indexInRemapped,
Consumer<CharSequence> remap) {
var resolvedBreaks = new Breaks[indexInRemapped.length];
apply(remappedString, indexInRemapped, resolvedBreaks, remap);
return resolvedBreaks[position] == null ? Breaks.UNKNOWN_BREAK : resolvedBreaks[position];
return resolvedBreaks[position] == null
? Breaks.UNKNOWN_BREAK
: resolvedBreaks[position];
}
}

Expand Down Expand Up @@ -384,17 +407,25 @@ public RegexRule(String before, Breaks result, String after, String line) {
// COMMENTS allows whitespace
}

public void apply(CharSequence remappedString, Integer[] indexInRemapped, Breaks[] resolvedBreaks, Consumer<CharSequence> remap) {
public void apply(
CharSequence remappedString,
Integer[] indexInRemapped,
Breaks[] resolvedBreaks,
Consumer<CharSequence> remap) {
for (int i = 0; i < indexInRemapped.length; ++i) {
if (resolvedBreaks[i] == Breaks.UNKNOWN_BREAK) {
resolvedBreaks[i] = applyAt(i, remappedString, indexInRemapped, remap);
}
}
}

public Breaks applyAt(int position, CharSequence remappedString, Integer[] indexInRemapped, Consumer<CharSequence> remap) {
if (matchAfter(matchSucceeding, remappedString, indexInRemapped[position]) &&
matchBefore(matchPrevious, remappedString, indexInRemapped[position])) {
public Breaks applyAt(
int position,
CharSequence remappedString,
Integer[] indexInRemapped,
Consumer<CharSequence> remap) {
if (matchAfter(matchSucceeding, remappedString, indexInRemapped[position])
&& matchBefore(matchPrevious, remappedString, indexInRemapped[position])) {
return breaks;
}
return Breaks.UNKNOWN_BREAK;
Expand Down Expand Up @@ -706,7 +737,6 @@ public static UnicodeMap<String> composeWith(
return target;
}


Builder addRemapRule(Double order, String before, String after, String line) {
line = whiteSpace.reset(line).replaceAll(" ");
if (lastComments.size() != 0) {
Expand Down Expand Up @@ -737,10 +767,7 @@ Builder addRemapRule(Double order, String before, String after, String line) {
+ "> "
+ TransliteratorUtilities.toXML.transliterate(line)
+ " </rule>");
rules.put(
order,
new Segmenter.RemapRule(
replaceVariables(before), after));
rules.put(order, new Segmenter.RemapRule(replaceVariables(before), after));
return this;
}

Expand All @@ -754,7 +781,8 @@ Builder addRemapRule(Double order, String before, String after, String line) {
* @param line
* @return
*/
Builder addRegexRule(Double order, String before, Breaks breaks, String after, String line) {
Builder addRegexRule(
Double order, String before, Breaks breaks, String after, String line) {
// if (brokenIdentifierMatcher.reset(line).find()) {
// int start = brokenIdentifierMatcher.start();
// int end = brokenIdentifierMatcher.end();
Expand Down

0 comments on commit 911069e

Please sign in to comment.