From 4af53431bf47f89f0264d7c19f3525606767e2eb Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Mon, 14 Oct 2024 13:26:46 +0200 Subject: [PATCH 1/2] Remove SegmenterCldr.txt (#947) --- .../unicode/text/UCD/GenerateBreakTest.java | 2 - .../unicode/text/UCD/MakeUnicodeFiles.java | 5 - .../org/unicode/text/UCD/MakeUnicodeFiles.txt | 3 - .../org/unicode/tools/SegmenterCldr.txt | 529 ------------------ .../org/unicode/tools/SegmenterDefault.txt | 2 - 5 files changed, 541 deletions(-) delete mode 100644 unicodetools/src/main/resources/org/unicode/tools/SegmenterCldr.txt diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateBreakTest.java b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateBreakTest.java index 5d2bc8373..ff506b4ee 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateBreakTest.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateBreakTest.java @@ -81,8 +81,6 @@ public static void main(String[] args) throws IOException { new GenerateWordBreakTest(ucd, Segmenter.Target.FOR_UCD).run(); new GenerateLineBreakTest(ucd, Segmenter.Target.FOR_UCD).run(); new GenerateSentenceBreakTest(ucd, Segmenter.Target.FOR_UCD).run(); - - new GenerateGraphemeBreakTest(ucd, Segmenter.Target.FOR_CLDR).run(); } GenerateBreakTest(UCD ucd, Segmenter seg) { diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java b/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java index 0c14d0f2d..7fea779e6 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java @@ -599,11 +599,6 @@ public static void generateFile(String filename) throws IOException { Default.ucd(), Segmenter.Target.FOR_UCD) .run(); break; - case "GraphemeBreakTest-cldr": - new GenerateBreakTest.GenerateGraphemeBreakTest( - Default.ucd(), Segmenter.Target.FOR_CLDR) - .run(); - break; case "DerivedName": case "DerivedLabel": generateDerivedName(filename); diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/MakeUnicodeFiles.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/MakeUnicodeFiles.txt index 3a3b5d608..357d1bf6d 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/MakeUnicodeFiles.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/MakeUnicodeFiles.txt @@ -92,9 +92,6 @@ Property: SPECIAL File: auxiliary/SentenceBreakTest Property: SPECIAL -File: cldr/GraphemeBreakTest-cldr -Property: SPECIAL - File: extra/BidiPairedBracketType Property: BidiPairedBracketType Format: valueList skipUnassigned=None diff --git a/unicodetools/src/main/resources/org/unicode/tools/SegmenterCldr.txt b/unicodetools/src/main/resources/org/unicode/tools/SegmenterCldr.txt deleted file mode 100644 index 5a2f4c8bf..000000000 --- a/unicodetools/src/main/resources/org/unicode/tools/SegmenterCldr.txt +++ /dev/null @@ -1,529 +0,0 @@ -@GraphemeClusterBreak -## double ## at the start of a line doesn't show up -## use Segmenter.Target.FOR_CLDR to select SegmenterCldr.txt - -# VARIABLES - -$CR=\p{Grapheme_Cluster_Break=CR} -$LF=\p{Grapheme_Cluster_Break=LF} -$Control=\p{Grapheme_Cluster_Break=Control} -$Extend=\p{Grapheme_Cluster_Break=Extend} -$ZWJ=\p{Grapheme_Cluster_Break=ZWJ} -$RI=\p{Grapheme_Cluster_Break=Regional_Indicator} -$Prepend=\p{Grapheme_Cluster_Break=Prepend} -$SpacingMark=\p{Grapheme_Cluster_Break=SpacingMark} -$L=\p{Grapheme_Cluster_Break=L} -$V=\p{Grapheme_Cluster_Break=V} -$T=\p{Grapheme_Cluster_Break=T} -$LV=\p{Grapheme_Cluster_Break=LV} -$LVT=\p{Grapheme_Cluster_Break=LVT} -# Note: The following may overlap with the above -# Note: ConjunctLinkingScripts is not used anymore, instead that list exists in the derivation of Indic_Conjunct_Break. -# It is kept here so that the diff of the generated test cases compared to the Unicode 15.1 β is minimal. -# TODO(egg): Consider removing in Unicode 16.0. -$ConjunctLinkingScripts=[\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}] -$ConjunctLinker=\p{Indic_Conjunct_Break=Linker} -$LinkingConsonant=\p{Indic_Conjunct_Break=Consonant} -## $E_Base=\p{Grapheme_Cluster_Break=E_Base} -## $E_Modifier=\p{Grapheme_Cluster_Break=E_Modifier} -$ExtPict=\p{Extended_Pictographic} -$ExtCccZwj=[\p{Indic_Conjunct_Break=Linker}\p{Indic_Conjunct_Break=Extend}] -## $EBG=\p{Grapheme_Cluster_Break=E_Base_GAZ} -## $Glue_After_Zwj=\p{Grapheme_Cluster_Break=Glue_After_Zwj} - -# RULES - -# Break at the start and end of text, unless the text is empty. -# Do not break between a CR and LF. Otherwise, break before and after controls. -3) $CR × $LF -4) ( $Control | $CR | $LF ) ÷ -5) ÷ ( $Control | $CR | $LF ) -# Do not break Hangul syllable sequences. -6) $L × ( $L | $V | $LV | $LVT ) -7) ( $LV | $V ) × ( $V | $T ) -8) ( $LVT | $T) × $T -## Do not break before extending characters or ZWJ. -## 9) × ($Extend | $ZWJ | $ConjunctLinker) -9) × ($Extend | $ZWJ) -# Only for extended grapheme clusters: Do not break before SpacingMarks, or after Prepend characters. -9.1) × $SpacingMark -9.2) $Prepend × -9.3) $LinkingConsonant $ExtCccZwj* $ConjunctLinker $ExtCccZwj* × $LinkingConsonant -## Do not break within emoji modifier sequences or emoji zwj sequences. -## 10) $E_Base $Extend* × $E_Modifier -11) $ExtPict $Extend* $ZWJ × $ExtPict -# Do not break within emoji flag sequences. That is, do not break between regional indicator (RI) symbols if there is an odd number of RI characters before the break point. -12) ^ ($RI $RI)* $RI × $RI -13) [^$RI] ($RI $RI)* $RI × $RI -# Otherwise, break everywhere. - -@LineBreak - -# VARIABLES - -$AI=\p{Line_Break=Ambiguous} -$AK=\p{Line_Break=Aksara} -$AL=\p{Line_Break=Alphabetic} -$AP=\p{Line_Break=Aksara_Prebase} -$AS=\p{Line_Break=Aksara_Start} -$B2=\p{Line_Break=Break_Both} -$BA=\p{Line_Break=Break_After} -$BB=\p{Line_Break=Break_Before} -$BK=\p{Line_Break=Mandatory_Break} -$CB=\p{Line_Break=Contingent_Break} -$CL=\p{Line_Break=Close_Punctuation} -$CP=\p{Line_Break=CP} -$CM1=\p{Line_Break=Combining_Mark} -$CR=\p{Line_Break=Carriage_Return} -$EX=\p{Line_Break=Exclamation} -$GL=\p{Line_Break=Glue} -$H2=\p{Line_Break=H2} -$H3=\p{Line_Break=H3} -$HL=\p{Line_Break=HL} -$HY=\p{Line_Break=Hyphen} -$ID=\p{Line_Break=Ideographic} -$IN=\p{Line_Break=Inseparable} -$IS=\p{Line_Break=Infix_Numeric} -$JL=\p{Line_Break=JL} -$JT=\p{Line_Break=JT} -$JV=\p{Line_Break=JV} -$LF=\p{Line_Break=Line_Feed} -$NL=\p{Line_Break=Next_Line} -$NS=\p{Line_Break=Nonstarter} -$NU=\p{Line_Break=Numeric} -$OP=\p{Line_Break=Open_Punctuation} -$PO=\p{Line_Break=Postfix_Numeric} -$PR=\p{Line_Break=Prefix_Numeric} -$QU=\p{Line_Break=Quotation} -$SA=\p{Line_Break=Complex_Context} -$SG=\p{Line_Break=Surrogate} -$SP=\p{Line_Break=Space} -$SY=\p{Line_Break=Break_Symbols} -$VF=\p{Line_Break=Virama_Final} -$VI=\p{Line_Break=Virama} -$WJ=\p{Line_Break=Word_Joiner} -$XX=\p{Line_Break=Unknown} -$ZW=\p{Line_Break=ZWSpace} -$CJ=\p{Line_Break=Conditional_Japanese_Starter} -$RI=\p{Line_Break=Regional_Indicator} -$EB=\p{Line_Break=E_Base} -$EM=\p{Line_Break=E_Modifier} -$ZWJ_O=\p{Line_Break=ZWJ} -$ZWJ=\p{Line_Break=ZWJ} - -$QU_Pi=($QU_Pi $X) -$QU_Pf=($QU_Pf $X) - -$DottedCircle = ◌ - -$CP30=[$CP-[\p{ea=F}\p{ea=W}\p{ea=H}]] -$OP30=[$OP-[\p{ea=F}\p{ea=W}\p{ea=H}]] - -$ExtPictUnassigned=[\p{Extended_Pictographic}&\p{gc=Cn}] - -# Some rules refer to the start and end of text. We could just use a literal ^ for sot, but naming -# it as in the spec makes it easier to compare. The parser will eat (and choke on) $, so we play a -# stupid trick instead. -$sot=^ -$eot=(?!.) - -# SPECIAL EXTENSIONS - -$CM=[$CM1 $ZWJ] -# LB 1 Assign a line breaking class to each code point of the input. -# Resolve AI, CB, SA, SG, and XX into other line breaking classes depending on criteria outside the scope of this algorithm. -# NOTE: CB is ok to fall through, but must handle others here. -## show $AL -$AL=[$AI $AL $SG $XX $SA] -$NS=[$NS $CJ] -## show $AL -## $oldAL=$AL // for debugging -# WARNING: Fixes for Rule 9 -# Treat X (CM|ZWJ* as if it were X. -# Where X is any line break class except SP, BK, CR, LF, NL or ZW. -$X=$CM* - -# MACROS - -$Spec1_=[$SP $BK $CR $LF $NL $ZW] -$Spec2_=[^ $SP $BK $CR $LF $NL $ZW] -$Spec3a_=[^ $SP $BA $HY $CM] -$Spec3b_=[^ $BA $HY $CM] -$Spec4_=[^ $NU $CM] -$Spec5_=[$BK $CB $CR $LF $NL $SP $ZW] - -# SPECIAL EXTENSIONS - -$AI=($AI $X) -$AK=($AK $X) -$AL=($AL $X) -$AP=($AP $X) -$AS=($AS $X) -$B2=($B2 $X) -$BA=($BA $X) -$BB=($BB $X) -$CB=($CB $X) -$CL=($CL $X) -$CP=($CP $X) -$CM=($CM $X) -## $CM=($CM $X) -$EX=($EX $X) -$GL=($GL $X) -$H2=($H2 $X) -$H3=($H3 $X) -$HL=($HL $X) -$HY=($HY $X) -$ID=($ID $X) -$IN=($IN $X) -$IS=($IS $X) -$JL=($JL $X) -$JT=($JT $X) -$JV=($JV $X) -$NS=($NS $X) -$NU=($NU $X) -$OP=($OP $X) -$PO=($PO $X) -$PR=($PR $X) -$QU=($QU $X) -$SA=($SA $X) -$SG=($SG $X) -$SY=($SY $X) -$VF=($VF $X) -$VI=($VI $X) -$WJ=($WJ $X) -$XX=($XX $X) -$RI=($RI $X) -$EB=($EB $X) -$EM=($EM $X) -$ZWJ=($ZWJ $X) - -$QU_Pi=($QU_Pi $X) -$QU_Pf=($QU_Pi $X) - -$DottedCircle=($DottedCircle $X) - -$CP30=($CP30 $X) -$OP30=($OP30 $X) - -# OUT OF ORDER ON PURPOSE - -# LB 10 Treat any remaining combining mark as AL. -$AL=($AL | ^ $CM | (?<=$Spec1_) $CM) - -# RULES - -# LB 4 Always break after hard line breaks (but never between CR and LF). -4) $BK ÷ -# LB 5 Treat CR followed by LF, as well as CR, LF and NL as hard line breaks. -5.01) $CR × $LF -5.02) $CR ÷ -5.03) $LF ÷ -5.04) $NL ÷ -# LB 6 Do not break before hard line breaks. -6) × ( $BK | $CR | $LF | $NL ) -# LB 7 Do not break before spaces or zero-width space. -7.01) × $SP -7.02) × $ZW -# LB 8 Break before any character following a zero-width space, even if one or more spaces intervene. -8) $ZW $SP* ÷ -# LB 8a Don't break between ZWJ and IDs (for use in Emoji ZWJ sequences) -8.1) $ZWJ_O × -# LB 9 Do not break a combining character sequence; treat it as if it has the LB class of the base character -# in all of the following rules. (Where X is any line break class except SP, BK, CR, LF, NL or ZW.) -9) $Spec2_ × $CM -##WARNING: this is done by modifying the variable values for all but SP.... That is, $AL is really ($AI $CM*)! -## LB 11 Do not break before or after WORD JOINER and related characters. -11.01) × $WJ -11.02) $WJ × -# LB 12 Do not break after NBSP and related characters. -## 12.01) [^$SP] × $GL -12) $GL × -12.1) $Spec3a_ × $GL -12.2) $Spec3b_ $CM+ × $GL -12.3) ^ $CM+ × $GL -# LB 13 Do not break before \u2018]\u2019 or \u2018!\u2019 or \u2018;\u2019 or \u2018/\u2019, even after spaces. -# Using customization 7. -13.01) × $EX -13.02) $Spec4_ × ($CL | $CP | $IS | $SY) -13.03) $Spec4_ $CM+ × ($CL | $CP | $IS | $SY) -13.04) ^ $CM+ × ($CL | $CP | $IS | $SY) -## 13.03) $Spec4_ × $IS -## 13.04) $Spec4_ × $SY -#LB 14 Do not break after \u2018[\u2019, even after spaces. -14) $OP $SP* × -# LB 15a Do not break after an unresolved initial punctuation that lies at the start of the line, -# after a space, after opening punctuation, or after an unresolved quotation mark, even after -# spaces. -15.11) ( $sot | $BK | $CR | $LF | $NL | $OP | $QU | $GL | $SP | $ZW ) $QU_Pi $SP* × -# LB 15b Do not break before an unresolved final punctuation that lies at the end of the line, before -# a space, before a prohibited break, or before an unresolved quotation mark, even before spaces. -15.21) × $QU_Pf ( $SP | $GL | $WJ | $CL | $QU | $CP | $EX | $IS | $SY | $BK | $CR | $LF | $NL | $ZW | $eot ) -# LB 16 Do not break between closing punctuation and a nonstarter (lb=NS), even with intervening spaces. -16) ($CL | $CP) $SP* × $NS -# LB 17 Do not break within \u2018\u2014\u2014\u2019, even with intervening spaces. -17) $B2 $SP* × $B2 -# LB 18 Break after spaces. -18) $SP ÷ -# LB 19 Do not break before or after \u2018\"\u2019. -19.01) × $QU -19.02) $QU × -# LB 20 Break before and after unresolved CB. -20.01) ÷ $CB -20.02) $CB ÷ -# LB 20.9 Don't break between Hyphens and Letters when there is a break preceding the hyphen. -# Originally added as a Finnish tailoring, now promoted to default CLDR behavior. -# Must be before LB 21. Note: this is not default UAX #14 behaviour. See ICU issue ICU-8151. -# (Unlike in ICU, here we just check a limited set of known breaks, ignoring some cases like LB 14). -20.09) $Spec5_ $HY × $AL -# LB 21 Do not break before hyphen-minus, other hyphens, fixed-width spaces, small kana and other non-starters, or after acute accents. -21.01) × $BA -21.02) × $HY -21.03) × $NS -21.04) $BB × -# LB 21a Don't break after Hebrew + Hyphen. -21.1) $HL ($HY | $BA) × -# LB 21b Don’t break between Solidus and Hebrew letters. -21.2) $SY × $HL -# LB 22 Do not break before ellipses. -## show $AL -22) × $IN -# LB 23 Do not break between digits and letters. -## 23.01) ($ID | $EB | $EM) × $PO -23.02) ($AL | $HL) × $NU -23.03) $NU × ($AL | $HL) -# LB 24 Do not break between prefix and letters or ideographs. -23.12) $PR × ($ID | $EB | $EM) -23.13) ($ID | $EB | $EM) × $PO -# LB24 Do not break between numeric prefix/postfix and letters, or between letters and prefix/postfix. -24.02) ($PR | $PO) × ($AL | $HL) -24.03) ($AL | $HL) × ($PR | $PO) -# Using customization 7 -# LB Alternative: ( PR | PO) ? ( OP | HY ) ? NU (NU | SY | IS) * (CL | CP) ? ( PR | PO) ? -# Insert × every place it could go. However, make sure that at least one thing is concrete, otherwise would cause $NU to not break before or after -25.01) ($PR | $PO) × ( $OP | $HY )? $NU -25.02) ( $OP | $HY ) × $NU -25.03) $NU × ($NU | $SY | $IS) -25.04) $NU ($NU | $SY | $IS)* × ($NU | $SY | $IS | $CL | $CP) -25.05) $NU ($NU | $SY | $IS)* ($CL | $CP)? × ($PO | $PR) -#LB 26 Do not break a Korean syllable. -26.01) $JL × $JL | $JV | $H2 | $H3 -26.02) $JV | $H2 × $JV | $JT -26.03) $JT | $H3 × $JT -# LB 27 Treat a Korean Syllable Block the same as ID. -27.01) $JL | $JV | $JT | $H2 | $H3 × $PO -27.02) $PR × $JL | $JV | $JT | $H2 | $H3 -# LB 28 Do not break between alphabetics (\"at\"). -28) ($AL | $HL) × ($AL | $HL) -# LB28a Do not break inside the orthographic syllables of Brahmic scripts. -28.11) $AP × ($AK | $DottedCircle | $AS) -28.12) ($AK | $DottedCircle | $AS) × ($VF | $VI) -28.13) ($AK | $DottedCircle | $AS) $VI × ($AK | $DottedCircle) -28.14) ($AK | $DottedCircle | $AS) × ($AK | $DottedCircle | $AS) $VF -# LB 29 Do not break between numeric punctuation and alphabetics (\"e.g.\"). -29) $IS × ($AL | $HL) -# LB 30 Do not break between letters, numbers or ordinary symbols and opening or closing punctuation. -30.01) ($AL | $HL | $NU) × $OP30 -30.02) $CP30 × ($AL | $HL | $NU) -# LB 30a Break between two Regional Indicators if and only if there is an even number of them before the point being considered. -30.11) $sot ($RI $RI)* $RI × $RI -30.12) [^$RI] ($RI $RI)* $RI × $RI -30.13) $RI ÷ $RI -# LB 30b Do not break between an emoji base (or potential emoji) and an emoji modifier. -30.21) $EB × $EM -30.22) $ExtPictUnassigned × $EM - -@SentenceBreak - -# VARIABLES - -$CR=\p{Sentence_Break=CR} -$LF=\p{Sentence_Break=LF} -$Extend=\p{Sentence_Break=Extend} -$Format=\p{Sentence_Break=Format} -$Sep=\p{Sentence_Break=Sep} -$Sp=\p{Sentence_Break=Sp} -$Lower=\p{Sentence_Break=Lower} -$Upper=\p{Sentence_Break=Upper} -$OLetter=\p{Sentence_Break=OLetter} -$Numeric=\p{Sentence_Break=Numeric} -$ATerm=\p{Sentence_Break=ATerm} -$STerm=\p{Sentence_Break=STerm} -$Close=\p{Sentence_Break=Close} -$SContinue=\p{Sentence_Break=SContinue} -$Any=. - -# SPECIAL EXTENSIONS - -## subtract Format from Control, since we don't want to break before/after -## $Control=[$Control-$Format] -## Expresses the negation in rule 8; can't do this with normal regex, but works with UnicodeSet, which is all we need. -## $NotStuff=[^$OLetter $Upper $Lower $Sep] -## # $ATerm and $Sterm are temporary, to match ICU until UTC decides. - -# WARNING: For Rule 5, now add format and extend to everything but Sep, Format, and Extend - -$FE=[$Format $Extend] -$NotPreLower_=[^ $OLetter $Upper $Lower $Sep $CR $LF $STerm $ATerm] -## $NotSep_=[^ $Sep $CR $LF] -## $FE=$Extend* $Format* -$Sp=($Sp $FE*) -$Lower=($Lower $FE*) -$Upper=($Upper $FE*) -$OLetter=($OLetter $FE*) -$Numeric=($Numeric $FE*) -$ATerm=($ATerm $FE*) -$STerm=($STerm $FE*) -$Close=($Close $FE*) -$SContinue=($SContinue $FE*) - -# MACROS - -$ParaSep = ($Sep | $CR | $LF) -$SATerm = ($STerm | $ATerm) - -# RULES - -# Break at the start and end of text, unless the text is empty. -# Do not break within CRLF. -3) $CR × $LF -# Break after paragraph separators. -4) $ParaSep ÷ -## 3.4) ( $Control | $CR | $LF ) ÷ -## 3.5) ÷ ( $Control | $CR | $LF ) -# Ignore Format and Extend characters, except after sot, ParaSep, and within CRLF. (See Section 6.2, Replacing Ignore Rules.) This also has the effect of: Any × (Format | Extend) -# WARNING: Implemented as don't break before format (except after linebreaks), -# AND add format and extend in all variables definitions that appear after this point! -## 3.91) [^$Control | $CR | $LF] × $Extend -5) × [$Format $Extend] -# Do not break after full stop in certain contexts. [See note below.] -# Do not break after ambiguous terminators like period, if immediately followed by a number or lowercase letter, -# is between uppercase letters, or if the first following letter (optionally after certain punctuation) is lowercase. -# For example, a period may be an abbreviation or numeric period, and not mark the end of a sentence. -6) $ATerm × $Numeric -7) ($Upper | $Lower) $ATerm × $Upper -8) $ATerm $Close* $Sp* × $NotPreLower_* $Lower -8.1) $SATerm $Close* $Sp* × ($SContinue | $SATerm) -# Break after sentence terminators, but include closing punctuation, trailing spaces, and any paragraph separator. [See note below.] Include closing punctuation, trailing spaces, and (optionally) a paragraph separator. -9) $SATerm $Close* × ( $Close | $Sp | $ParaSep ) -# Note the fix to $Sp*, $Sep? -10) $SATerm $Close* $Sp* × ( $Sp | $ParaSep ) -11) $SATerm $Close* $Sp* $ParaSep? ÷ -#Otherwise, do not break -998) × $Any - -@WordBreak - -# VARIABLES - -$CR=\p{Word_Break=CR} -$LF=\p{Word_Break=LF} -$Newline=\p{Word_Break=Newline} -## $Control=\p{Word_Break=Control} -$Extend=\p{Word_Break=Extend} -## $NEWLINE=[$CR $LF \u0085 \u000B \u000C \u2028 \u2029] -## $Sep=\p{Sentence_Break=Sep} -# Now normal variables -$Format=[\p{Word_Break=Format}] -$Katakana=\p{Word_Break=Katakana} -$ALetter=\p{Word_Break=ALetter} -$MidLetter=\p{Word_Break=MidLetter} -$MidNum=\p{Word_Break=MidNum} -$MidNumLet=\p{Word_Break=MidNumLet} -$Numeric=\p{Word_Break=Numeric} -$ExtendNumLet=\p{Word_Break=ExtendNumLet} -$RI=\p{Word_Break=Regional_Indicator} -$Hebrew_Letter=\p{Word_Break=Hebrew_Letter} -$Double_Quote=\p{Word_Break=Double_Quote} -$Single_Quote=\p{Word_Break=Single_Quote} -## $E_Base=\p{Word_Break=E_Base} -## $E_Modifier=\p{Word_Break=E_Modifier} -$ZWJ=\p{Word_Break=ZWJ} -# Note: The following may overlap with the above -$ExtPict=\p{Extended_Pictographic} -## $EBG=\p{Word_Break=E_Base_GAZ} -## $Glue_After_Zwj=\p{Word_Break=Glue_After_Zwj} -$WSegSpace=\p{Word_Break=WSegSpace} - -# MACROS - -$AHLetter=($ALetter | $Hebrew_Letter) -$MidNumLetQ=($MidNumLet | $Single_Quote) -## WARNING: For Rule 4: Fixes for GC, Format -## # Subtract Format from Control, since we don't want to break before/after -## $Control=[$Control-$Format] - -# SPECIAL EXTENSIONS - -# Add format and extend to everything -$FE=[$Format $Extend $ZWJ] - -$NotBreak_=[^ $Newline $CR $LF ] -## $FE= ($Extend | $Format)* -$Katakana=($Katakana $FE*) -$ALetter=($ALetter $FE*) -$MidLetter=($MidLetter $FE*) -$MidNum=($MidNum $FE*) -$MidNumLet=($MidNumLet $FE*) -$Numeric=($Numeric $FE*) -$ExtendNumLet=($ExtendNumLet $FE*) -$RI=($RI $FE*) -$Hebrew_Letter=($Hebrew_Letter $FE*) -$Double_Quote=($Double_Quote $FE*) -$Single_Quote=($Single_Quote $FE*) -## $E_Base=($E_Base $FE*) -## $E_Modifier=($E_Modifier $FE*) -## $ZWJ=($ZWJ $FE*) # don't do this one! -## $Glue_After_Zwj=($Glue_After_Zwj $FE*) -## $EBG=($EBG $FE*) -$AHLetter=($AHLetter $FE*) -$MidNumLetQ=($MidNumLetQ $FE*) - -# RULES - -# Break at the start and end of text, unless the text is empty. -# Do not break within CRLF. -3) $CR × $LF -# Otherwise break before and after Newlines (including CR and LF) -3.1) ($Newline | $CR | $LF) ÷ -3.2) ÷ ($Newline | $CR | $LF) -# Do not break within emoji zwj sequences. -3.3) $ZWJ × $ExtPict -3.4) $WSegSpace × $WSegSpace -## 3.4) ( $Control | $CR | $LF ) ÷ -## 3.5) ÷ ( $Control | $CR | $LF ) -## 3.9) × $Extend -## 3.91) [^$Control | $CR | $LF] × $Extend -# Ignore Format and Extend characters, except after sot, CR, LF, and Newline. (See Section 6.2, Replacing Ignore Rules.) This also has the effect of: Any × (Format | Extend) -# WARNING: Implemented as don't break before format (except after linebreaks), -# AND add format and extend in all variables definitions that appear after this point! -## 4) × [$Format $Extend] -4) $NotBreak_ × [$Format $Extend $ZWJ] - -# VANILLA RULES - -# Do not break between most letters. -5) $AHLetter × $AHLetter -# Do not break letters across certain punctuation. -6) $AHLetter × ($MidLetter | $MidNumLetQ) $AHLetter -7) $AHLetter ($MidLetter | $MidNumLetQ) × $AHLetter -7.1) $Hebrew_Letter × $Single_Quote -7.2) $Hebrew_Letter × $Double_Quote $Hebrew_Letter -7.3) $Hebrew_Letter $Double_Quote × $Hebrew_Letter -# Do not break within sequences of digits, or digits adjacent to letters (“3a”, or “A3”). -8) $Numeric × $Numeric -9) $AHLetter × $Numeric -10) $Numeric × $AHLetter -# Do not break within sequences, such as “3.2” or “3,456.789”. -11) $Numeric ($MidNum | $MidNumLetQ) × $Numeric -12) $Numeric × ($MidNum | $MidNumLetQ) $Numeric -# Do not break between Katakana. -13) $Katakana × $Katakana -# Do not break from extenders. -13.1) ($AHLetter | $Numeric | $Katakana | $ExtendNumLet) × $ExtendNumLet -13.2) $ExtendNumLet × ($AHLetter | $Numeric | $Katakana) -## Do not break within emoji modifier sequences. -## 14) $E_Base × $E_Modifier -# Do not break within emoji flag sequences. That is, do not break between regional indicator (RI) symbols if there is an odd number of RI characters before the break point. -15) ^ ($RI $RI)* $RI × $RI -16) [^$RI] ($RI $RI)* $RI × $RI -# Otherwise, break everywhere (including around ideographs). diff --git a/unicodetools/src/main/resources/org/unicode/tools/SegmenterDefault.txt b/unicodetools/src/main/resources/org/unicode/tools/SegmenterDefault.txt index a57c184fa..e7e6193bd 100644 --- a/unicodetools/src/main/resources/org/unicode/tools/SegmenterDefault.txt +++ b/unicodetools/src/main/resources/org/unicode/tools/SegmenterDefault.txt @@ -1,6 +1,5 @@ @GraphemeClusterBreak ## double ## at the start of a line doesn't show up -## use Segmenter.Target.FOR_CLDR to select SegmenterCldr.txt # VARIABLES @@ -157,7 +156,6 @@ $Spec2_=[^ $SP $BK $CR $LF $NL $ZW] $Spec3a_=[^ $SP $BA $HY $CM] $Spec3b_=[^ $BA $HY $CM] $Spec4_=[^ $NU $CM] -##CLDR: $Spec5_=[$BK $CB $CR $LF $NL $SP $ZW] # SPECIAL EXTENSIONS From 69a376c10bff00dd683b6700d0f2e6dc0c7b1c38 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 15 Oct 2024 21:25:43 +0200 Subject: [PATCH 2/2] Python 3.8 reached eol (#948) --- .github/workflows/pythonpackage.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml index 8317ccb34..22c9beb61 100644 --- a/.github/workflows/pythonpackage.yml +++ b/.github/workflows/pythonpackage.yml @@ -15,7 +15,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.8] + python-version: [3.12] steps: - uses: actions/checkout@v3