From 85aadc46593a8633024de5f7f4c6080307c499ce Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Fri, 15 Mar 2024 12:21:04 +0100 Subject: [PATCH] ICU-22707 UTC-179-C28 LB19 change for simplified chinese --- icu4c/source/data/brkitr/rules/line.txt | 25 ++++++- icu4c/source/test/intltest/rbbitst.cpp | 88 +++++++++++++++++++++++-- 2 files changed, 104 insertions(+), 9 deletions(-) diff --git a/icu4c/source/data/brkitr/rules/line.txt b/icu4c/source/data/brkitr/rules/line.txt index 538e3865f3e0..49dd0d17eb05 100644 --- a/icu4c/source/data/brkitr/rules/line.txt +++ b/icu4c/source/data/brkitr/rules/line.txt @@ -35,7 +35,7 @@ $BK = [:LineBreak = Mandatory_Break:]; $B2 = [:LineBreak = Break_Both:]; $CB = [:LineBreak = Contingent_Break:]; $CJ = [:LineBreak = Conditional_Japanese_Starter:]; -$CL = [:LineBreak = Close_Punctuation:]; +$CL = [[:LineBreak = Close_Punctuation:]]; # $CM = [:LineBreak = Combining_Mark:]; $CP = [:LineBreak = Close_Parenthesis:]; $CR = [:LineBreak = Carriage_Return:]; @@ -251,6 +251,7 @@ $SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM]; # # LB 15d Do not break before numeric separators (IS), even after spaces. +# SP IS QU is handled below as part of LB 19. [$LB8NonBreaks - $SP] $IS; $SP $IS $CM* [$CanFollowIS {eof}]; @@ -274,12 +275,27 @@ $LB18Breaks = [$LB8Breaks $SP]; # LB 19 -# x QU $LB18NonBreaks $CM* $QU; ^$CM+ $QU; -# QU x +[$LB18NonBreaks & [\p{ea=F}\p{ea=W}\p{ea=H}] - [$OP $GL $BA]] / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; +[$LB18NonBreaks & [\p{ea=F}\p{ea=W}\p{ea=H}] - [$OP $GL $BA]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; +^[$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; +^[$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; + $QU $CM* .; +[$LB18NonBreaks & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]]; +[$LB18NonBreaks & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] $CM* $CMX / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]]; +^[$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]]; +^[$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] $CM* $CMX / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]]; + +$SP [$IS & [\p{ea=F}\p{ea=W}\p{ea=H}]] / [$QU & \p{Pi}] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; +$SP [$IS & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* $CMX / [$QU & \p{Pi}] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; +$SP [$IS & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [$QU & \p{Pf}] / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]]; +$SP [$IS & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [$QU & \p{Pf}] $CM* $CMX / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]]; + +^$CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; +^$CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; # LB 20 # $CB @@ -287,6 +303,9 @@ $QU $CM* .; # $LB20NonBreaks = [$LB18NonBreaks - $CB]; +[$LB20NonBreaks - $HL] $CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; +[$LB20NonBreaks - $HL] $CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; + # LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen. # Originally added as a Finnish tailoring, now promoted to default ICU behavior. # Note: this is not default UAX-14 behaviour. See issue ICU-8151. diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index 4151a32055f1..5111b0119b3a 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -2706,6 +2706,7 @@ class RBBILineMonkey: public RBBIMonkeyKind { UnicodeSet *fVI; UnicodeSet *fPi; UnicodeSet *fPf; + UnicodeSet *feaFWH; BreakIterator *fCharBI; const UnicodeString *fText; @@ -2785,6 +2786,8 @@ RBBILineMonkey::RBBILineMonkey() : fPi = new UnicodeSet(uR"([\p{Pi}])", status); fPf = new UnicodeSet(uR"([\p{Pf}])", status); + feaFWH = new UnicodeSet(uR"([\p{ea=F}\p{ea=W}\p{ea=H}])", status); + if (U_FAILURE(status)) { deferredStatus = status; return; @@ -2916,9 +2919,23 @@ void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos // LB 9 Treat X CM* as if it were x. // No explicit action required. - // LB 10 Treat any remaining combining mark as AL + // LB 10 Treat any remaining combining mark as AL, but preserve its East + // Asian Width. if (fCM->contains(*posChar)) { - *posChar = u'A'; + switch (u_getIntPropertyValue(*posChar, UCHAR_EAST_ASIAN_WIDTH)) { + case U_EA_WIDE: + *posChar = u'♈'; + break; + case U_EA_NEUTRAL: + *posChar = u'ᴬ'; + break; + case U_EA_AMBIGUOUS: + *posChar = u'Ⓐ'; + break; + default: + puts("Unexpected ea value for lb=CM"); + std::terminate(); + } } // Push the updated nextPos and nextChar back to our caller. @@ -3231,12 +3248,70 @@ int32_t RBBILineMonkey::next(int32_t startPos) { break; } - // x QU - // QU x - if (fQU->contains(thisChar) || fQU->contains(prevChar)) { - setAppliedRule(pos, "LB 19"); + // LB 19 + // × [QU-\p{Pi}] + if (fQU->contains(thisChar) && !fPi->contains(thisChar)) { + setAppliedRule(pos, "LB 19 × [QU-\\p{Pi}]"); + continue; + } + // [^\p{ea=F}\p{ea=W}\p{ea=H}] × [\p{Pi}&QU] + if (!feaFWH->contains(prevChar) && fPi->contains(thisChar) && fQU->contains(thisChar)) { + setAppliedRule(pos, "LB 19 [^\\p{ea=F}\\p{ea=W}\\p{ea=H}] × [\\p{Pi}&QU]"); + continue; + } + // × [\p{Pi}&QU] ( [^\p{ea=F}\p{ea=W}\p{ea=H}] | eot ) + if (fPi->contains(thisChar) && fQU->contains(thisChar)) { + if (nextPos < fText->length()) { + UChar32 nextChar = fText->char32At(nextPos); + if (!feaFWH->contains(nextChar)) { + setAppliedRule(pos, "LB 19 × [\\p{Pi}&QU] [^\\p{ea=F}\\p{ea=W}\\p{ea=H}]"); + continue; + } + } else { + setAppliedRule(pos, "LB 19 × [\\p{Pi}&QU] eot"); + continue; + } + } + + // [QU-\p{Pf}] × + if (fQU->contains(prevChar) && !fPf->contains(prevChar)) { + setAppliedRule(pos, "LB 19 [QU-\\p{Pf}] ×"); + continue; + } + // [\p{Pf}&QU] × [^\p{ea=F}\p{ea=W}\p{ea=H}] + if (fPf->contains(prevChar) && fQU->contains(prevChar) && !feaFWH->contains(thisChar)) { + setAppliedRule(pos, "LB 19 [\\p{Pf}&QU] × [^\\p{ea=F}\\p{ea=W}\\p{ea=H}]"); continue; } + // ( sot | [^\p{ea=F}\p{ea=W}\p{ea=H}] ) [\p{Pf}&QU] × + if (fPf->contains(prevChar) && fQU->contains(prevChar)) { + if (prevPos == 0) { + setAppliedRule(pos, "LB 19 sot [\\p{Pf}&QU] ×"); + continue; + } + // prevPosX2 is -1 if there was a break, and prevCharX2 is 0; but the UAX #14 rules can + // look through breaks. + int breakObliviousPrevPosX2 = fText->moveIndex32(prevPos, -1); + while (fCM->contains(fText->char32At(breakObliviousPrevPosX2))) { + if (breakObliviousPrevPosX2 == 0) { + break; + } + int beforeCM = fText->moveIndex32(breakObliviousPrevPosX2, -1); + if (fBK->contains(fText->char32At(beforeCM)) || + fCR->contains(fText->char32At(beforeCM)) || + fLF->contains(fText->char32At(beforeCM)) || + fNL->contains(fText->char32At(beforeCM)) || + fSP->contains(fText->char32At(beforeCM)) || + fZW->contains(fText->char32At(beforeCM))) { + break; + } + breakObliviousPrevPosX2 = beforeCM; + } + if (!feaFWH->contains(fText->char32At(breakObliviousPrevPosX2))) { + setAppliedRule(pos, "LB 19 [^\\p{ea=F}\\p{ea=W}\\p{ea=H}] [\\p{Pf}&QU] ×"); + continue; + } + } if (fCB->contains(thisChar) || fCB->contains(prevChar)) { setAppliedRule(pos, "LB 20 Break around a CB"); @@ -3615,6 +3690,7 @@ RBBILineMonkey::~RBBILineMonkey() { delete fVI; delete fPi; delete fPf; + delete feaFWH; delete fCharBI; delete fNumberMatcher;