From 4af53431bf47f89f0264d7c19f3525606767e2eb Mon Sep 17 00:00:00 2001
From: Robin Leroy <eggrobin@unicode.org>
Date: Mon, 14 Oct 2024 13:26:46 +0200
Subject: [PATCH 1/2] Remove SegmenterCldr.txt (#947)

---
 .../unicode/text/UCD/GenerateBreakTest.java   |   2 -
 .../unicode/text/UCD/MakeUnicodeFiles.java    |   5 -
 .../org/unicode/text/UCD/MakeUnicodeFiles.txt |   3 -
 .../org/unicode/tools/SegmenterCldr.txt       | 529 ------------------
 .../org/unicode/tools/SegmenterDefault.txt    |   2 -
 5 files changed, 541 deletions(-)
 delete mode 100644 unicodetools/src/main/resources/org/unicode/tools/SegmenterCldr.txt

diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateBreakTest.java b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateBreakTest.java
index 5d2bc8373..ff506b4ee 100644
--- a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateBreakTest.java
+++ b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateBreakTest.java
@@ -81,8 +81,6 @@ public static void main(String[] args) throws IOException {
         new GenerateWordBreakTest(ucd, Segmenter.Target.FOR_UCD).run();
         new GenerateLineBreakTest(ucd, Segmenter.Target.FOR_UCD).run();
         new GenerateSentenceBreakTest(ucd, Segmenter.Target.FOR_UCD).run();
-
-        new GenerateGraphemeBreakTest(ucd, Segmenter.Target.FOR_CLDR).run();
     }
 
     GenerateBreakTest(UCD ucd, Segmenter seg) {
diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java b/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java
index 0c14d0f2d..7fea779e6 100644
--- a/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java
+++ b/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java
@@ -599,11 +599,6 @@ public static void generateFile(String filename) throws IOException {
                                     Default.ucd(), Segmenter.Target.FOR_UCD)
                             .run();
                     break;
-                case "GraphemeBreakTest-cldr":
-                    new GenerateBreakTest.GenerateGraphemeBreakTest(
-                                    Default.ucd(), Segmenter.Target.FOR_CLDR)
-                            .run();
-                    break;
                 case "DerivedName":
                 case "DerivedLabel":
                     generateDerivedName(filename);
diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/MakeUnicodeFiles.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/MakeUnicodeFiles.txt
index 3a3b5d608..357d1bf6d 100644
--- a/unicodetools/src/main/resources/org/unicode/text/UCD/MakeUnicodeFiles.txt
+++ b/unicodetools/src/main/resources/org/unicode/text/UCD/MakeUnicodeFiles.txt
@@ -92,9 +92,6 @@ Property: SPECIAL
 File: auxiliary/SentenceBreakTest
 Property: SPECIAL
 
-File: cldr/GraphemeBreakTest-cldr
-Property: SPECIAL
-
 File:	extra/BidiPairedBracketType
 Property: BidiPairedBracketType
 Format:	valueList skipUnassigned=None
diff --git a/unicodetools/src/main/resources/org/unicode/tools/SegmenterCldr.txt b/unicodetools/src/main/resources/org/unicode/tools/SegmenterCldr.txt
deleted file mode 100644
index 5a2f4c8bf..000000000
--- a/unicodetools/src/main/resources/org/unicode/tools/SegmenterCldr.txt
+++ /dev/null
@@ -1,529 +0,0 @@
-@GraphemeClusterBreak
-## double ## at the start of a line doesn't show up
-## use Segmenter.Target.FOR_CLDR to select SegmenterCldr.txt
-
-# VARIABLES
-
-$CR=\p{Grapheme_Cluster_Break=CR}
-$LF=\p{Grapheme_Cluster_Break=LF}
-$Control=\p{Grapheme_Cluster_Break=Control}
-$Extend=\p{Grapheme_Cluster_Break=Extend}
-$ZWJ=\p{Grapheme_Cluster_Break=ZWJ}
-$RI=\p{Grapheme_Cluster_Break=Regional_Indicator}
-$Prepend=\p{Grapheme_Cluster_Break=Prepend}
-$SpacingMark=\p{Grapheme_Cluster_Break=SpacingMark}
-$L=\p{Grapheme_Cluster_Break=L}
-$V=\p{Grapheme_Cluster_Break=V}
-$T=\p{Grapheme_Cluster_Break=T}
-$LV=\p{Grapheme_Cluster_Break=LV}
-$LVT=\p{Grapheme_Cluster_Break=LVT}
-# Note: The following may overlap with the above
-# Note: ConjunctLinkingScripts is not used anymore, instead that list exists in the derivation of Indic_Conjunct_Break.
-# It is kept here so that the diff of the generated test cases compared to the Unicode 15.1 β is minimal.
-# TODO(egg): Consider removing in Unicode 16.0.
-$ConjunctLinkingScripts=[\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}]
-$ConjunctLinker=\p{Indic_Conjunct_Break=Linker}
-$LinkingConsonant=\p{Indic_Conjunct_Break=Consonant}
-##	$E_Base=\p{Grapheme_Cluster_Break=E_Base}
-##	$E_Modifier=\p{Grapheme_Cluster_Break=E_Modifier}
-$ExtPict=\p{Extended_Pictographic}
-$ExtCccZwj=[\p{Indic_Conjunct_Break=Linker}\p{Indic_Conjunct_Break=Extend}]
-##	$EBG=\p{Grapheme_Cluster_Break=E_Base_GAZ}
-##	$Glue_After_Zwj=\p{Grapheme_Cluster_Break=Glue_After_Zwj}
-
-# RULES
-
-# Break at the start and end of text, unless the text is empty.
-# Do not break between a CR and LF. Otherwise, break before and after controls.
-3) $CR  	×  	$LF
-4) ( $Control | $CR | $LF ) 	÷
-5) ÷ 	( $Control | $CR | $LF )
-# Do not break Hangul syllable sequences.
-6) $L 	× 	( $L | $V | $LV | $LVT )
-7) ( $LV | $V ) 	× 	( $V | $T )
-8) ( $LVT | $T)    ×  $T
-## Do not break before extending characters or ZWJ.
-##	9) × 	($Extend | $ZWJ | $ConjunctLinker)
-9) × 	($Extend | $ZWJ)
-# Only for extended grapheme clusters: Do not break before SpacingMarks, or after Prepend characters.
-9.1) × 	$SpacingMark
-9.2) $Prepend  ×
-9.3) $LinkingConsonant $ExtCccZwj* $ConjunctLinker $ExtCccZwj*  × $LinkingConsonant
-## Do not break within emoji modifier sequences or emoji zwj sequences.
-##	10) $E_Base $Extend* × $E_Modifier
-11) $ExtPict $Extend* $ZWJ × $ExtPict
-# Do not break within emoji flag sequences. That is, do not break between regional indicator (RI) symbols if there is an odd number of RI characters before the break point.
-12) ^ ($RI $RI)* $RI × $RI
-13) [^$RI] ($RI $RI)* $RI × $RI
-# Otherwise, break everywhere.
-
-@LineBreak
-
-# VARIABLES
-
-$AI=\p{Line_Break=Ambiguous}
-$AK=\p{Line_Break=Aksara}
-$AL=\p{Line_Break=Alphabetic}
-$AP=\p{Line_Break=Aksara_Prebase}
-$AS=\p{Line_Break=Aksara_Start}
-$B2=\p{Line_Break=Break_Both}
-$BA=\p{Line_Break=Break_After}
-$BB=\p{Line_Break=Break_Before}
-$BK=\p{Line_Break=Mandatory_Break}
-$CB=\p{Line_Break=Contingent_Break}
-$CL=\p{Line_Break=Close_Punctuation}
-$CP=\p{Line_Break=CP}
-$CM1=\p{Line_Break=Combining_Mark}
-$CR=\p{Line_Break=Carriage_Return}
-$EX=\p{Line_Break=Exclamation}
-$GL=\p{Line_Break=Glue}
-$H2=\p{Line_Break=H2}
-$H3=\p{Line_Break=H3}
-$HL=\p{Line_Break=HL}
-$HY=\p{Line_Break=Hyphen}
-$ID=\p{Line_Break=Ideographic}
-$IN=\p{Line_Break=Inseparable}
-$IS=\p{Line_Break=Infix_Numeric}
-$JL=\p{Line_Break=JL}
-$JT=\p{Line_Break=JT}
-$JV=\p{Line_Break=JV}
-$LF=\p{Line_Break=Line_Feed}
-$NL=\p{Line_Break=Next_Line}
-$NS=\p{Line_Break=Nonstarter}
-$NU=\p{Line_Break=Numeric}
-$OP=\p{Line_Break=Open_Punctuation}
-$PO=\p{Line_Break=Postfix_Numeric}
-$PR=\p{Line_Break=Prefix_Numeric}
-$QU=\p{Line_Break=Quotation}
-$SA=\p{Line_Break=Complex_Context}
-$SG=\p{Line_Break=Surrogate}
-$SP=\p{Line_Break=Space}
-$SY=\p{Line_Break=Break_Symbols}
-$VF=\p{Line_Break=Virama_Final}
-$VI=\p{Line_Break=Virama}
-$WJ=\p{Line_Break=Word_Joiner}
-$XX=\p{Line_Break=Unknown}
-$ZW=\p{Line_Break=ZWSpace}
-$CJ=\p{Line_Break=Conditional_Japanese_Starter}
-$RI=\p{Line_Break=Regional_Indicator}
-$EB=\p{Line_Break=E_Base}
-$EM=\p{Line_Break=E_Modifier}
-$ZWJ_O=\p{Line_Break=ZWJ}
-$ZWJ=\p{Line_Break=ZWJ}
-
-$QU_Pi=($QU_Pi $X)
-$QU_Pf=($QU_Pf $X)
-
-$DottedCircle = ◌
-
-$CP30=[$CP-[\p{ea=F}\p{ea=W}\p{ea=H}]]
-$OP30=[$OP-[\p{ea=F}\p{ea=W}\p{ea=H}]]
-
-$ExtPictUnassigned=[\p{Extended_Pictographic}&\p{gc=Cn}]
-
-# Some rules refer to the start and end of text.  We could just use a literal ^ for sot, but naming
-# it as in the spec makes it easier to compare.  The parser will eat (and choke on) $, so we play a
-# stupid trick instead.
-$sot=^
-$eot=(?!.)
-
-# SPECIAL EXTENSIONS
-
-$CM=[$CM1 $ZWJ]
-# LB 1  Assign a line breaking class to each code point of the input. 
-# Resolve AI, CB, SA, SG, and XX into other line breaking classes depending on criteria outside the scope of this algorithm.
-# NOTE: CB is ok to fall through, but must handle others here.
-##	show $AL
-$AL=[$AI $AL $SG $XX $SA]
-$NS=[$NS $CJ]
-##	show $AL
-##	$oldAL=$AL // for debugging
-# WARNING: Fixes for Rule 9
-# Treat X (CM|ZWJ* as if it were X.
-# Where X is any line break class except SP, BK, CR, LF, NL or ZW.
-$X=$CM*
-
-# MACROS
-
-$Spec1_=[$SP $BK $CR $LF $NL $ZW]
-$Spec2_=[^ $SP $BK $CR $LF $NL $ZW]
-$Spec3a_=[^ $SP $BA $HY $CM]
-$Spec3b_=[^ $BA $HY $CM]
-$Spec4_=[^ $NU $CM]
-$Spec5_=[$BK $CB $CR $LF $NL $SP $ZW]
-
-# SPECIAL EXTENSIONS
-
-$AI=($AI $X)
-$AK=($AK $X)
-$AL=($AL $X)
-$AP=($AP $X)
-$AS=($AS $X)
-$B2=($B2 $X)
-$BA=($BA $X)
-$BB=($BB $X)
-$CB=($CB $X)
-$CL=($CL $X)
-$CP=($CP $X)
-$CM=($CM $X)
-##	$CM=($CM $X)
-$EX=($EX $X)
-$GL=($GL $X)
-$H2=($H2 $X)
-$H3=($H3 $X)
-$HL=($HL $X)
-$HY=($HY $X)
-$ID=($ID $X)
-$IN=($IN $X)
-$IS=($IS $X)
-$JL=($JL $X)
-$JT=($JT $X)
-$JV=($JV $X)
-$NS=($NS $X)
-$NU=($NU $X)
-$OP=($OP $X)
-$PO=($PO $X)
-$PR=($PR $X)
-$QU=($QU $X)
-$SA=($SA $X)
-$SG=($SG $X)
-$SY=($SY $X)
-$VF=($VF $X)
-$VI=($VI $X)
-$WJ=($WJ $X)
-$XX=($XX $X)
-$RI=($RI $X)
-$EB=($EB $X)
-$EM=($EM $X)
-$ZWJ=($ZWJ $X)
-
-$QU_Pi=($QU_Pi $X)
-$QU_Pf=($QU_Pi $X)
-
-$DottedCircle=($DottedCircle $X)
-
-$CP30=($CP30 $X)
-$OP30=($OP30 $X)
-
-# OUT OF ORDER ON PURPOSE
-
-# LB 10  Treat any remaining combining mark as AL.
-$AL=($AL | ^ $CM | (?<=$Spec1_) $CM)
-
-# RULES
-
-# LB 4  Always break after hard line breaks (but never between CR and LF).
-4) $BK ÷
-# LB 5  Treat CR followed by LF, as well as CR, LF and NL as hard line breaks.
-5.01) $CR × $LF
-5.02) $CR ÷
-5.03) $LF ÷
-5.04) $NL ÷
-# LB 6  Do not break before hard line breaks.
-6) × ( $BK | $CR | $LF | $NL )
-# LB 7  Do not break before spaces or zero-width space.
-7.01) × $SP
-7.02) × $ZW
-# LB 8  Break before any character following a zero-width space, even if one or more spaces intervene.
-8) $ZW $SP* ÷
-# LB 8a  Don't break between ZWJ and IDs (for use in Emoji ZWJ sequences)
-8.1) $ZWJ_O ×
-# LB 9  Do not break a combining character sequence; treat it as if it has the LB class of the base character
-# in all of the following rules. (Where X is any line break class except SP, BK, CR, LF, NL or ZW.)
-9) $Spec2_ × $CM
-##WARNING: this is done by modifying the variable values for all but SP.... That is, $AL is really ($AI $CM*)!
-## LB 11  Do not break before or after WORD JOINER and related characters.
-11.01) × $WJ
-11.02) $WJ ×
-# LB 12  Do not break after NBSP and related characters.
-##	12.01) [^$SP] × $GL
-12) $GL ×
-12.1) $Spec3a_ × $GL
-12.2) $Spec3b_ $CM+ × $GL
-12.3) ^ $CM+ × $GL
-# LB 13  Do not break before \u2018]\u2019 or \u2018!\u2019 or \u2018;\u2019 or \u2018/\u2019, even after spaces.
-# Using customization 7.
-13.01) × $EX
-13.02) $Spec4_ × ($CL | $CP | $IS | $SY)
-13.03) $Spec4_ $CM+ ×  ($CL | $CP | $IS | $SY)
-13.04) ^ $CM+ ×  ($CL | $CP | $IS | $SY)
-##	13.03) $Spec4_ × $IS
-##	13.04) $Spec4_ × $SY
-#LB 14  Do not break after \u2018[\u2019, even after spaces.
-14) $OP $SP* ×
-# LB 15a Do not break after an unresolved initial punctuation that lies at the start of the line,
-# after a space, after opening punctuation, or after an unresolved quotation mark, even after
-# spaces.
-15.11) ( $sot | $BK | $CR | $LF | $NL | $OP | $QU | $GL | $SP | $ZW ) $QU_Pi $SP* ×
-# LB 15b Do not break before an unresolved final punctuation that lies at the end of the line, before
-# a space, before a prohibited break, or before an unresolved quotation mark, even before spaces.
-15.21) × $QU_Pf ( $SP | $GL | $WJ | $CL | $QU | $CP | $EX | $IS | $SY | $BK | $CR | $LF | $NL | $ZW | $eot )
-# LB 16  Do not break between closing punctuation and a nonstarter (lb=NS), even with intervening spaces.
-16) ($CL | $CP) $SP* × $NS
-# LB 17  Do not break within \u2018\u2014\u2014\u2019, even with intervening spaces.
-17) $B2 $SP* × $B2
-# LB 18  Break after spaces.
-18) $SP ÷
-# LB 19  Do not break before or after \u2018\"\u2019.
-19.01)  × $QU
-19.02) $QU ×
-# LB 20  Break before and after unresolved CB.
-20.01)  ÷ $CB
-20.02) $CB ÷
-# LB 20.9  Don't break between Hyphens and Letters when there is a break preceding the hyphen.
-# Originally added as a Finnish tailoring, now promoted to default CLDR behavior.
-# Must be before LB 21. Note: this is not default UAX #14 behaviour. See ICU issue ICU-8151.
-# (Unlike in ICU, here we just check a limited set of known breaks, ignoring some cases like LB 14).
-20.09) $Spec5_ $HY × $AL
-# LB 21  Do not break before hyphen-minus, other hyphens, fixed-width spaces, small kana and other non-starters, or after acute accents.
-21.01) × $BA
-21.02) × $HY
-21.03) × $NS
-21.04) $BB ×
-# LB 21a  Don't break after Hebrew + Hyphen.
-21.1) $HL ($HY | $BA) ×
-# LB 21b Don’t break between Solidus and Hebrew letters.
-21.2) $SY × $HL
-# LB 22  Do not break before ellipses.
-##	show $AL
-22) × $IN
-# LB 23  Do not break between digits and letters.
-##	23.01) ($ID | $EB | $EM) × $PO
-23.02) ($AL | $HL) × $NU
-23.03) $NU × ($AL | $HL)
-# LB 24  Do not break between prefix and letters or ideographs.
-23.12) $PR × ($ID | $EB | $EM)
-23.13) ($ID | $EB | $EM) × $PO
-# LB24 Do not break between numeric prefix/postfix and letters, or between letters and prefix/postfix.
-24.02) ($PR | $PO) × ($AL | $HL)
-24.03) ($AL | $HL) × ($PR | $PO)
-# Using customization 7
-# LB Alternative: ( PR | PO) ? ( OP | HY ) ? NU (NU | SY | IS) * (CL | CP) ? ( PR | PO) ?
-# Insert × every place it could go. However, make sure that at least one thing is concrete, otherwise would cause $NU to not break before or after 
-25.01) ($PR | $PO) × ( $OP | $HY )? $NU
-25.02) ( $OP | $HY ) × $NU
-25.03) $NU × ($NU | $SY | $IS)
-25.04) $NU ($NU | $SY | $IS)* × ($NU | $SY | $IS | $CL | $CP)
-25.05) $NU ($NU | $SY | $IS)* ($CL | $CP)? × ($PO | $PR)
-#LB 26 Do not break a Korean syllable.
-26.01) $JL  × $JL | $JV | $H2 | $H3
-26.02) $JV | $H2 × $JV | $JT
-26.03) $JT | $H3 × $JT
-# LB 27 Treat a Korean Syllable Block the same as ID.
-27.01) $JL | $JV | $JT | $H2 | $H3  × $PO
-27.02) $PR × $JL | $JV | $JT | $H2 | $H3
-# LB 28  Do not break between alphabetics (\"at\").
-28) ($AL | $HL) × ($AL | $HL)
-# LB28a Do not break inside the orthographic syllables of Brahmic scripts.
-28.11) $AP × ($AK | $DottedCircle | $AS)
-28.12) ($AK | $DottedCircle | $AS) × ($VF | $VI)
-28.13) ($AK | $DottedCircle | $AS) $VI × ($AK | $DottedCircle)
-28.14) ($AK | $DottedCircle | $AS) × ($AK | $DottedCircle | $AS) $VF
-# LB 29  Do not break between numeric punctuation and alphabetics (\"e.g.\").
-29) $IS × ($AL | $HL)
-# LB 30  Do not break between letters, numbers or ordinary symbols and opening or closing punctuation.
-30.01) ($AL | $HL | $NU) × $OP30
-30.02) $CP30 × ($AL | $HL | $NU)
-# LB 30a  Break between two Regional Indicators if and only if there is an even number of them before the point being considered.
-30.11) $sot ($RI $RI)* $RI × $RI
-30.12) [^$RI] ($RI $RI)* $RI × $RI
-30.13) $RI ÷ $RI
-# LB 30b Do not break between an emoji base (or potential emoji) and an emoji modifier.
-30.21) $EB × $EM
-30.22) $ExtPictUnassigned × $EM
-
-@SentenceBreak
-
-# VARIABLES
-
-$CR=\p{Sentence_Break=CR}
-$LF=\p{Sentence_Break=LF}
-$Extend=\p{Sentence_Break=Extend}
-$Format=\p{Sentence_Break=Format}
-$Sep=\p{Sentence_Break=Sep}
-$Sp=\p{Sentence_Break=Sp}
-$Lower=\p{Sentence_Break=Lower}
-$Upper=\p{Sentence_Break=Upper}
-$OLetter=\p{Sentence_Break=OLetter}
-$Numeric=\p{Sentence_Break=Numeric}
-$ATerm=\p{Sentence_Break=ATerm}
-$STerm=\p{Sentence_Break=STerm}
-$Close=\p{Sentence_Break=Close}
-$SContinue=\p{Sentence_Break=SContinue}
-$Any=.
-
-# SPECIAL EXTENSIONS
-
-## subtract Format from Control, since we don't want to break before/after
-##	$Control=[$Control-$Format]
-## Expresses the negation in rule 8; can't do this with normal regex, but works with UnicodeSet, which is all we need.
-##	$NotStuff=[^$OLetter $Upper $Lower $Sep]
-##	# $ATerm and $Sterm are temporary, to match ICU until UTC decides.
-
-# WARNING: For Rule 5, now add format and extend to everything but Sep, Format, and Extend
-
-$FE=[$Format $Extend]
-$NotPreLower_=[^ $OLetter $Upper $Lower $Sep $CR $LF $STerm $ATerm]
-##	$NotSep_=[^ $Sep $CR $LF]
-##	$FE=$Extend* $Format*
-$Sp=($Sp $FE*)
-$Lower=($Lower $FE*)
-$Upper=($Upper $FE*)
-$OLetter=($OLetter $FE*)
-$Numeric=($Numeric $FE*)
-$ATerm=($ATerm $FE*)
-$STerm=($STerm $FE*)
-$Close=($Close $FE*)
-$SContinue=($SContinue $FE*)
-
-# MACROS
-
-$ParaSep = ($Sep | $CR | $LF)
-$SATerm = ($STerm | $ATerm)
-
-# RULES
-
-# Break at the start and end of text, unless the text is empty.
-# Do not break within CRLF.
-3) $CR  	×  	$LF
-# Break after paragraph separators.
-4) $ParaSep  	÷
-##	3.4) ( $Control | $CR | $LF ) 	÷
-##	3.5) ÷ 	( $Control | $CR | $LF )
-# Ignore Format and Extend characters, except after sot, ParaSep, and within CRLF. (See Section 6.2, Replacing Ignore Rules.) This also has the effect of: Any × (Format | Extend)
-# WARNING: Implemented as don't break before format (except after linebreaks),
-# AND add format and extend in all variables definitions that appear after this point!
-##	3.91) [^$Control | $CR | $LF] × 	$Extend
-5) × [$Format $Extend]
-# Do not break after full stop in certain contexts. [See note below.]
-# Do not break after ambiguous terminators like period, if immediately followed by a number or lowercase letter,
-# is between uppercase letters, or if the first following letter (optionally after certain punctuation) is lowercase.
-# For example, a period may be an abbreviation or numeric period, and not mark the end of a sentence.
-6) $ATerm 	× 	$Numeric
-7) ($Upper | $Lower) $ATerm 	× 	$Upper
-8) $ATerm $Close* $Sp* 	× 	$NotPreLower_* $Lower
-8.1) $SATerm $Close* $Sp* 	× 	($SContinue | $SATerm)
-# Break after sentence terminators, but include closing punctuation, trailing spaces, and any paragraph separator. [See note below.] Include closing punctuation, trailing spaces, and (optionally) a paragraph separator.
-9) $SATerm $Close* 	× 	( $Close | $Sp | $ParaSep )
-# Note the fix to $Sp*, $Sep?
-10) $SATerm $Close* $Sp* 	× 	( $Sp | $ParaSep )
-11) $SATerm $Close* $Sp* $ParaSep? ÷
-#Otherwise, do not break
-998) × 	$Any
-
-@WordBreak
-
-# VARIABLES
-
-$CR=\p{Word_Break=CR}
-$LF=\p{Word_Break=LF}
-$Newline=\p{Word_Break=Newline}
-##	$Control=\p{Word_Break=Control}
-$Extend=\p{Word_Break=Extend}
-##	$NEWLINE=[$CR $LF \u0085 \u000B \u000C \u2028 \u2029]
-##	$Sep=\p{Sentence_Break=Sep}
-# Now normal variables
-$Format=[\p{Word_Break=Format}]
-$Katakana=\p{Word_Break=Katakana}
-$ALetter=\p{Word_Break=ALetter}
-$MidLetter=\p{Word_Break=MidLetter}
-$MidNum=\p{Word_Break=MidNum}
-$MidNumLet=\p{Word_Break=MidNumLet}
-$Numeric=\p{Word_Break=Numeric}
-$ExtendNumLet=\p{Word_Break=ExtendNumLet}
-$RI=\p{Word_Break=Regional_Indicator}
-$Hebrew_Letter=\p{Word_Break=Hebrew_Letter}
-$Double_Quote=\p{Word_Break=Double_Quote}
-$Single_Quote=\p{Word_Break=Single_Quote}
-##	$E_Base=\p{Word_Break=E_Base}
-##	$E_Modifier=\p{Word_Break=E_Modifier}
-$ZWJ=\p{Word_Break=ZWJ}
-# Note: The following may overlap with the above
-$ExtPict=\p{Extended_Pictographic}
-##	$EBG=\p{Word_Break=E_Base_GAZ}
-##	$Glue_After_Zwj=\p{Word_Break=Glue_After_Zwj}
-$WSegSpace=\p{Word_Break=WSegSpace}
-
-# MACROS
-
-$AHLetter=($ALetter | $Hebrew_Letter)
-$MidNumLetQ=($MidNumLet | $Single_Quote)
-## WARNING: For Rule 4: Fixes for GC, Format
-##	# Subtract Format from Control, since we don't want to break before/after
-##	$Control=[$Control-$Format]
-
-# SPECIAL EXTENSIONS
-
-# Add format and extend to everything
-$FE=[$Format $Extend $ZWJ]
-
-$NotBreak_=[^ $Newline $CR $LF ]
-##	$FE= ($Extend | $Format)*
-$Katakana=($Katakana $FE*)
-$ALetter=($ALetter $FE*)
-$MidLetter=($MidLetter $FE*)
-$MidNum=($MidNum $FE*)
-$MidNumLet=($MidNumLet $FE*)
-$Numeric=($Numeric $FE*)
-$ExtendNumLet=($ExtendNumLet $FE*)
-$RI=($RI $FE*)
-$Hebrew_Letter=($Hebrew_Letter $FE*)
-$Double_Quote=($Double_Quote $FE*)
-$Single_Quote=($Single_Quote $FE*)
-##	$E_Base=($E_Base $FE*)
-##	$E_Modifier=($E_Modifier $FE*)
-##	$ZWJ=($ZWJ $FE*) # don't do this one!
-##	$Glue_After_Zwj=($Glue_After_Zwj $FE*)
-##	$EBG=($EBG $FE*)
-$AHLetter=($AHLetter $FE*)
-$MidNumLetQ=($MidNumLetQ $FE*)
-
-# RULES
-
-# Break at the start and end of text, unless the text is empty.
-# Do not break within CRLF.
-3) $CR  	×  	$LF
-# Otherwise break before and after Newlines (including CR and LF)
-3.1) ($Newline | $CR | $LF)	÷
-3.2) ÷    ($Newline | $CR | $LF)
-# Do not break within emoji zwj sequences.
-3.3) $ZWJ × $ExtPict
-3.4) $WSegSpace × $WSegSpace
-##	3.4) ( $Control | $CR | $LF ) 	÷
-##	3.5) ÷ 	( $Control | $CR | $LF )
-##	3.9) × 	$Extend
-##	3.91) [^$Control | $CR | $LF] × 	$Extend
-# Ignore Format and Extend characters, except after sot, CR, LF, and Newline. (See Section 6.2, Replacing Ignore Rules.) This also has the effect of: Any × (Format | Extend)
-# WARNING: Implemented as don't break before format (except after linebreaks),
-# AND add format and extend in all variables definitions that appear after this point!
-##	4) × [$Format $Extend]
-4) $NotBreak_ × [$Format $Extend $ZWJ]
-
-# VANILLA RULES
-
-# Do not break between most letters.
-5) $AHLetter  	×  	$AHLetter
-# Do not break letters across certain punctuation.
-6) $AHLetter 	× 	($MidLetter | $MidNumLetQ) $AHLetter
-7) $AHLetter ($MidLetter | $MidNumLetQ) 	× 	$AHLetter
-7.1) $Hebrew_Letter × $Single_Quote
-7.2) $Hebrew_Letter × $Double_Quote $Hebrew_Letter
-7.3) $Hebrew_Letter $Double_Quote × $Hebrew_Letter
-# Do not break within sequences of digits, or digits adjacent to letters (“3a”, or “A3”).
-8) $Numeric 	× 	$Numeric
-9) $AHLetter 	× 	$Numeric
-10) $Numeric 	× 	$AHLetter
-# Do not break within sequences, such as “3.2” or “3,456.789”.
-11) $Numeric ($MidNum | $MidNumLetQ) 	× 	$Numeric
-12) $Numeric 	× 	($MidNum | $MidNumLetQ) $Numeric
-# Do not break between Katakana.
-13) $Katakana 	× 	$Katakana
-# Do not break from extenders.
-13.1) ($AHLetter | $Numeric | $Katakana | $ExtendNumLet) 	× 	$ExtendNumLet
-13.2) $ExtendNumLet 	× 	($AHLetter | $Numeric | $Katakana)
-## Do not break within emoji modifier sequences.
-##	14) $E_Base × $E_Modifier
-# Do not break within emoji flag sequences. That is, do not break between regional indicator (RI) symbols if there is an odd number of RI characters before the break point.
-15) ^ ($RI $RI)* $RI × $RI
-16) [^$RI] ($RI $RI)* $RI × $RI
-# Otherwise, break everywhere (including around ideographs).
diff --git a/unicodetools/src/main/resources/org/unicode/tools/SegmenterDefault.txt b/unicodetools/src/main/resources/org/unicode/tools/SegmenterDefault.txt
index a57c184fa..e7e6193bd 100644
--- a/unicodetools/src/main/resources/org/unicode/tools/SegmenterDefault.txt
+++ b/unicodetools/src/main/resources/org/unicode/tools/SegmenterDefault.txt
@@ -1,6 +1,5 @@
 @GraphemeClusterBreak
 ## double ## at the start of a line doesn't show up
-## use Segmenter.Target.FOR_CLDR to select SegmenterCldr.txt
 
 # VARIABLES
 
@@ -157,7 +156,6 @@ $Spec2_=[^ $SP $BK $CR $LF $NL $ZW]
 $Spec3a_=[^ $SP $BA $HY $CM]
 $Spec3b_=[^ $BA $HY $CM]
 $Spec4_=[^ $NU $CM]
-##CLDR: $Spec5_=[$BK $CB $CR $LF $NL $SP $ZW]
 
 # SPECIAL EXTENSIONS
 

From 69a376c10bff00dd683b6700d0f2e6dc0c7b1c38 Mon Sep 17 00:00:00 2001
From: Robin Leroy <eggrobin@unicode.org>
Date: Tue, 15 Oct 2024 21:25:43 +0200
Subject: [PATCH 2/2] Use Python 3.12 in pythonpackage workflow; 3.8 reached EOL (#948)

---
 .github/workflows/pythonpackage.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml
index 8317ccb34..22c9beb61 100644
--- a/.github/workflows/pythonpackage.yml
+++ b/.github/workflows/pythonpackage.yml
@@ -15,7 +15,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.8]
+        python-version: [3.12]
 
     steps:
     - uses: actions/checkout@v3