From ab2752ca3d12e539ddfb316723bb0746b1282383 Mon Sep 17 00:00:00 2001 From: macchiati Date: Tue, 20 Aug 2024 10:19:06 -0700 Subject: [PATCH] CLDR-17830 Regenerate segmentation data --- common/segments/root.xml | 72 ++++++++++++++++++++++++++++------------ 1 file changed, 50 insertions(+), 22 deletions(-) diff --git a/common/segments/root.xml b/common/segments/root.xml index eaddb6b4288..02972fb5103 100644 --- a/common/segments/root.xml +++ b/common/segments/root.xml @@ -1,10 +1,9 @@ @@ -115,7 +114,12 @@ For terms of use, see http://www.unicode.org/copyright.html \p{Line_Break=ZWJ} [$QU & \p{gc=Pi}] [$QU & \p{gc=Pf}] + [$QU - \p{gc=Pi}] + [$QU - \p{gc=Pf}] + [^\p{ea=F}\p{ea=W}\p{ea=H}] + [$BA & $NotEastAsian] + [\u2010] [$CP-[\p{ea=F}\p{ea=W}\p{ea=H}]] [$OP-[\p{ea=F}\p{ea=W}\p{ea=H}]] [\p{Extended_Pictographic}&\p{gc=Cn}] @@ -184,12 +188,18 @@ For terms of use, see http://www.unicode.org/copyright.html ($ZWJ $X) ($QU_Pi $X) ($QU_Pf $X) + ($QUmPi $X) + ($QUmPf $X) + ( $NotEastAsian | [$NotEastAsian - $Spec1_] $X) + (NonEastAsianBA $X) ($DottedCircle $X) + ($Hyphen $X) ($CP30 $X) ($OP30 $X) - + ($AL | ^ $CM | (?<=$Spec1_) $CM) + ( $NotEastAsian | ^ $CM | (?<=$Spec1_) $CM ) @@ -220,11 +230,10 @@ For terms of use, see http://www.unicode.org/copyright.html $Spec3b_ $CM+ × $GL ^ $CM+ × $GL - × $EX - $Spec4_ × ($CL | $CP | $IS | $SY) - $Spec4_ $CM+ × ($CL | $CP | $IS | $SY) - ^ $CM+ × ($CL | $CP | $IS | $SY) + × $CL + × $CP + × $SY $OP $SP* × @@ -234,6 +243,10 @@ For terms of use, see http://www.unicode.org/copyright.html × $QU_Pf ( $SP | $GL | $WJ | $CL | $QU | $CP | $EX | $IS | $SY | $BK | $CR | $LF | $NL | $ZW | $eot ) + + $SP ÷ $IS $NU + + × $IS ($CL | $CP) $SP* × $NS @@ -241,18 +254,25 @@ For terms of use, see http://www.unicode.org/copyright.html $SP ÷ - × $QU - $QU × + × $QUmPi + $QUmPf × + + $NotEastAsian × $QU + × $QU ( $NotEastAsian | $eot ) + $QU × $NotEastAsian + ( $sot | $NotEastAsian ) $QU × ÷ $CB $CB ÷ + + ( $sot | $BK | $CR | $LF | $NL | $SP | $ZW | $CB | $GL ) ( $HY | $Hyphen ) × $AL × $BA × $HY × $NS $BB × - - $HL ($HY | $BA) × + + $HL ($HY | $NonEastAsianBA) × [^$HL] $SY × $HL @@ -266,14 +286,22 @@ For terms of use, see http://www.unicode.org/copyright.html ($PR | $PO) × ($AL | $HL) ($AL | $HL) × ($PR | $PO) - - - - ($PR | $PO) × ( $OP | $HY )? $NU - ( $OP | $HY ) × $NU - $NU × ($NU | $SY | $IS) - $NU ($NU | $SY | $IS)* × ($NU | $SY | $IS | $CL | $CP) - $NU ($NU | $SY | $IS)* ($CL | $CP)? × ($PO | $PR) + + $NU ( $SY | $IS )* $CL × $PO + $NU ( $SY | $IS )* $CP × $PO + $NU ( $SY | $IS )* $CL × $PR + $NU ( $SY | $IS )* $CP × $PR + $NU ( $SY | $IS )* × $PO + $NU ( $SY | $IS )* × $PR + $PO × $OP $NU + $PO × $OP $IS $NU + $PO × $NU + $PR × $OP $NU + $PR × $OP $IS $NU + $PR × $NU + $HY × $NU + $IS × $NU + $NU ( $SY | $IS )* × $NU $JL × $JL | $JV | $H2 | $H3 $JV | $H2 × $JV | $JT @@ -376,7 +404,7 @@ For terms of use, see http://www.unicode.org/copyright.html [\p{Word_Break=Format}] \p{Word_Break=Katakana} \p{Word_Break=ALetter} - [\p{Word_Break = MidLetter} - [\: \uFE55 \uFF1A]] + \p{Word_Break=MidLetter} \p{Word_Break=MidNum} \p{Word_Break=MidNumLet} \p{Word_Break=Numeric}