diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java index d809a1659..59a52e436 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java @@ -15,6 +15,7 @@ import java.text.ParseException; import java.text.ParsePosition; import java.util.ArrayList; +import java.util.Arrays; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; @@ -42,7 +43,6 @@ public class TestUnicodeInvariants { private static final boolean DEBUG = false; - // private static final Pattern IN_PATTERN = Pattern.compile("(.*)([≠=])(.*)"); private static final boolean ICU_VERSION = false; // ignore the versions if this is true private static final Factory LATEST_PROPS = getProperties(Settings.latestVersion); private static final boolean SHOW_LOOKUP = false; @@ -690,9 +690,13 @@ private static void inLine( } } - // A one-token lookahead. - // Tokens are defined as runs of [^\p{Pattern_White_Space}\p{Pattern_Syntax}], - // or single code points in \p{Pattern_Syntax}. + /** + * A one-token lookahead. Tokens are defined as: 1. words: runs of + * [^\p{Pattern_White_Space}\p{Pattern_Syntax}]; 2. simple operators: sequences of the form + * \p{Pattern_Syntax} \p{Mn}*; 3. explicitly expected sequences of words and simple operators + * without intervening spaces; this allows for contextually accepting operators such as :=, >>, + * ’s, or .GT., without treating, e.g., every >> as atomic. + */ private static class Lookahead { // Advances pp through any pattern white space, then looks ahead one token. 
public static Lookahead oneToken(ParsePosition pp, String text) { @@ -700,7 +704,43 @@ public static Lookahead oneToken(ParsePosition pp, String text) { return oneTokenNoSpace(pp, text); } - // Returns null if pp is before pattern white space; otherwise, looks ahead one token. + /** + * Advances pp through any pattern white space, then looks ahead one token, treating the + * given sequences as single tokens. + */ + public static Lookahead oneToken(ParsePosition pp, String text, String... sequences) { + scan(PATTERN_WHITE_SPACE, text, pp, true); + Lookahead result = oneTokenNoSpace(pp, text); + if (result == null) { + return result; + } + Lookahead candidate = result; + for (; ; ) { + final String candidateToken = candidate.token; + final boolean candidateIsSequencePrefix = + Arrays.asList(sequences).stream() + .anyMatch(s -> s.startsWith(candidateToken)); + if (!candidateIsSequencePrefix) { + break; + } + final Lookahead continuation = oneTokenNoSpace(candidate.next, text); + if (continuation == null) { + break; + } + candidate = + new Lookahead(candidateToken + continuation.token, pp, continuation.next); + + if (Arrays.asList(sequences).contains(candidate.token)) { + result = candidate; + } + } + return result; + } + + /** + * Returns null if pp is before pattern white space; otherwise, looks ahead one token. This + * function does not alter pp. 
+ */ public static Lookahead oneTokenNoSpace(ParsePosition pp, String text) { ParsePosition next = new ParsePosition(pp.getIndex()); if (next.getIndex() == text.length()) { @@ -708,9 +748,10 @@ public static Lookahead oneTokenNoSpace(ParsePosition pp, String text) { } int start = next.getIndex(); if (PATTERN_SYNTAX.contains(text.codePointAt(start))) { - final String result = Character.toString(text.codePointAt(start)); - next.setIndex(start + result.length()); - return new Lookahead(result, pp, next); + final String syntax = Character.toString(text.codePointAt(start)); + next.setIndex(start + syntax.length()); + final String marks = scan(NONSPACING_MARK, text, next, true); + return new Lookahead(syntax + marks, pp, next); } else { final String result = scan(PATTERN_SYNTAX_OR_WHITE_SPACE, text, next, false); return result.isEmpty() ? null : new Lookahead(result, pp, next); @@ -723,15 +764,18 @@ private Lookahead(String token, ParsePosition pp, ParsePosition next) { this.next = next; } - // Advances the ParsePosition passed at construction past the token, and returns the token. + /** + * Advances the ParsePosition passed at construction past the token, and returns the token. + */ public String consume() { pp.setIndex(next.getIndex()); return token; } - // If this token is expected, advances the ParsePosition passed at construction past the - // token past it and returns true. - // Otherwise, this function no effect and returns false. + /** + * If this token is expected, advances the ParsePosition passed at construction past the + * token and returns true. Otherwise, this function has no effect and returns false. 
+ */ public boolean accept(String expected) { if (expected.equals(token)) { consume(); @@ -748,8 +792,10 @@ public boolean accept(String expected) { private static void expectToken(String token, ParsePosition pp, String text) throws ParseException { - if (!Lookahead.oneToken(pp, text).accept(token)) { - throw new ParseException("Expected '" + token + "'", pp.getIndex()); + final var lookahead = Lookahead.oneToken(pp, text, token); + if (!lookahead.accept(token)) { + throw new ParseException( + "Expected '" + token + "', got '" + lookahead.token + "'", pp.getIndex()); } } @@ -1069,7 +1115,7 @@ protected String _getVersion() { private static void letLine(ParsePosition pp, String source) throws ParseException { expectToken("$", pp, source); final String variable = Lookahead.oneTokenNoSpace(pp, source).consume(); - expectToken("=", pp, source); + expectToken(":=", pp, source); final int valueStart = pp.getIndex(); final UnicodeSet valueSet = parseUnicodeSet(source, pp); valueSet.complement().complement(); @@ -1454,6 +1500,7 @@ private static int parseError( final int eol = source.indexOf("\n", statementStart); source = source.substring(sol >= 0 ? sol : 0, eol >= 0 ? 
eol : source.length()); } + source = source.trim(); printErrorLine("Parse Failure", Side.START, parseErrorCount); println("**** PARSE ERROR:\t" + source); @@ -1506,6 +1553,7 @@ private static void printErrorLine(String title, Side side, int testFailureCount private static final UnicodeSet PATTERN_WHITE_SPACE = new UnicodeSet("\\p{pattern white space}").freeze(); private static final UnicodeSet PATTERN_SYNTAX = new UnicodeSet("\\p{pattern syntax}").freeze(); + private static final UnicodeSet NONSPACING_MARK = new UnicodeSet("\\p{Mn}").freeze(); private static final UnicodeSet PATTERN_SYNTAX_OR_WHITE_SPACE = new UnicodeSet("[\\p{pattern white space}\\p{pattern syntax}]").freeze(); diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/SecurityInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/SecurityInvariantTest.txt index 838516704..da80bd532 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/SecurityInvariantTest.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/SecurityInvariantTest.txt @@ -18,7 +18,7 @@ \p{Identifier_Type=Not_NFKC} = [\p{NFKC_QC=No}-\p{Deprecated}-\p{Default_Ignorable_Code_Point}] -Let $Strongly_Restricted = [\p{Identifier_Type=Not_Character}\p{Identifier_Type=Deprecated}\p{Identifier_Type=Default_Ignorable}\p{Identifier_Type=Not_NFKC}] +Let $Strongly_Restricted := [\p{Identifier_Type=Not_Character}\p{Identifier_Type=Deprecated}\p{Identifier_Type=Default_Ignorable}\p{Identifier_Type=Not_NFKC}] \p{Identifier_Type=Not_XID} = [\P{XID_Continue}-$Strongly_Restricted-\p{Identifier_Type=Inclusion}] diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt index 9f684144f..fd7be0d42 100644 --- a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt +++ b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt @@ -164,12 +164,12 @@ # 
General Constants ########################## -Let $foo = \p{ccc=9} -Let $fii = \p{toNFD=/$foo/} +Let $foo := \p{ccc=9} +Let $fii := \p{toNFD=/$foo/} -Let $codepoints = [\u0000-\U0010FFFF] +Let $codepoints := [\u0000-\U0010FFFF] -Let $gcAllPunctuation = \p{gc=/_Punctuation/} +Let $gcAllPunctuation := \p{gc=/_Punctuation/} $gcAllPunctuation = [ \p{gc=Close_Punctuation} \p{gc=Connector_Punctuation} @@ -180,13 +180,13 @@ $gcAllPunctuation = [ \p{gc=Other_Punctuation} ] -Let $gcAllSymbols = \p{gc=/_Symbol/} +Let $gcAllSymbols := \p{gc=/_Symbol/} $gcAllSymbols = [\p{gc=Math_Symbol}\p{gc=Currency_Symbol}\p{gc=Modifier_Symbol}\p{gc=Other_Symbol}] -Let $gcAllMarks = \p{gc=/_Mark/} +Let $gcAllMarks := \p{gc=/_Mark/} $gcAllMarks = [\p{gc=Nonspacing_Mark}\p{gc=Enclosing_Mark}\p{gc=Spacing_Mark}] -Let $gcAllLetters = \p{gc=/_Letter/} +Let $gcAllLetters := \p{gc=/_Letter/} $gcAllLetters = [\p{gc=Ll}\p{gc=Lu}\p{gc=Lo}\p{gc=Lt}\p{gc=Lm}] ########################## @@ -273,11 +273,11 @@ In \p{dt=canonical}, (delete-adjacent-duplicates) * bc * \P{bc=NSM} * dm = bc * # Stability: The property values for the bidirectional properties Bidi_Class and Bidi_Mirrored preserve canonical equivalence. 
# This test utilizes the fact that bc=NSM inherit behavior in the algorithm, so these are just filtered # There are 6 special cases, all symmetric symbols—which are not mirrored—with a solidus overlay: -Let $BMExclusions = [ ≠ ∤ ∦ ≢ ≭ ⫝̸ ] +Let $BMExclusions := [ ≠ ∤ ∦ ≢ ≭ ⫝̸ ] In [\p{dt=canonical}-$BMExclusions], (delete-adjacent-duplicates) * Bidi_M * \P{bc=NSM} * dm = Bidi_M * \P{bc=NSM} # Additional BIDI invariant constants -Let $AL_blocks = [ +Let $AL_blocks := [ \u0600-\u07BF \u0860-\u086F # Syriac Supplement, 10.0 \u0870-\u089F # Arabic Extended-B, 14.0 @@ -292,7 +292,7 @@ Let $AL_blocks = [ \U0001ED00-\U0001ED4F # Ottoman Siyaq Numbers, 12.0 \U0001EE00-\U0001EEFF ] -Let $R_blocks = [ +Let $R_blocks := [ \u0590-\u05FF \u07C0-\u085F \uFB1D-\uFB4F @@ -319,7 +319,7 @@ $AL_blocks ∥ [\p{Bidi_Class=L} \p{Bidi_Class=R}] # U6.0: BN characters are default ignorable, noncharacters, controls, minus marks, bidi-controls, alphabetic, whitespace, with a few exceptions -Let $BN_Exceptions = [\u001C-\u001F\u17B4\u17B5] +Let $BN_Exceptions := [\u001C-\u001F\u17B4\u17B5] [\p{Bidi_Class=BN}] = [ \p{di}\p{nchar}\p{gc=Cc} @@ -331,7 +331,7 @@ Let $BN_Exceptions = [\u001C-\u001F\u17B4\u17B5] ] # Nonspacing and enclosing combining marks are bc=NSM, with a few exceptions (all of which are nonspacing) -Let $gcMn_bcL = [\u0CBF\u0CC6\U00011A07\U00011A08\U00011C3F] +Let $gcMn_bcL := [\u0CBF\u0CC6\U00011A07\U00011A08\U00011C3F] \p{bc=NSM} = [\p{gc=Mn} \p{gc=Me} - $gcMn_bcL] ########################## @@ -441,8 +441,8 @@ In \P{U-1:GC=Cn}, Jamo_Short_Name=U-1:Jamo_Short_Name # NOTE: If this invariant needs to change, the section on migrating from the # Unicode 3.0 identifier definition in UTS #55 must be updated: # https://www.unicode.org/reports/tr55/#Evolution-Unicode-3. 
-Let $identifier_start = [\p{GC=Lu}\p{GC=Ll}\p{GC=Lt}\p{GC=Lm}\p{GC=Lo}\p{GC=Nl}] -Let $identifier_extend = [\p{GC=Mn}\p{GC=Mc}\p{GC=Nd}\p{GC=Pc}] +Let $identifier_start := [\p{GC=Lu}\p{GC=Ll}\p{GC=Lt}\p{GC=Lm}\p{GC=Lo}\p{GC=Nl}] +Let $identifier_extend := [\p{GC=Mn}\p{GC=Mc}\p{GC=Nd}\p{GC=Pc}] [\p{ID_Start}\p{Name=VERTICAL TILDE}] ⊇ $identifier_start [\p{ID_Continue}\p{Name=VERTICAL TILDE}] ⊇ [$identifier_start $identifier_extend] @@ -461,7 +461,7 @@ In \P{U-1:GC=Cn}, ccc=U-1:ccc # Canonical decompositions (minus exclusions) must be identical across releases (also required by strong normalization stability), # except where a character and at least one character in its decomposition are both new in the release. -Let $New_Decompositions = [ +Let $New_Decompositions := [ [ \p{Decomposition_Type=Canonical} - \p{Full_Composition_Exclusion}] - [\p{U-1:Decomposition_Type=Canonical} - \p{U-1:Full_Composition_Exclusion}] ] @@ -470,7 +470,7 @@ $New_Decompositions ⊆ \p{U-1:GC=Cn} # something, that is, the decomposition contains newly-assigned characters. 
In $New_Decompositions, toNFD * \P{U-1:GC=Cn} ≠ toNFD -Let $Unicode_13_Decompositions = [ +Let $Unicode_13_Decompositions := [ [\p{U13.0.0:Decomposition_Type=Canonical} - \p{U13.0.0:Full_Composition_Exclusion}] - [\p{U12.1.0:Decomposition_Type=Canonical} - \p{U12.1.0:Full_Composition_Exclusion}] ] @@ -479,7 +479,7 @@ In $Unicode_13_Decompositions, toNFD * \P{U12.1.0:GC=Cn} ≠ toNFD $Unicode_13_Decompositions = [\U00011938] $Unicode_13_Decompositions = [\p{Name=DIVES AKURU VOWEL SIGN O}] -Let $Unicode_7_Decompositions = [ +Let $Unicode_7_Decompositions := [ [\p{U7.0.0:Decomposition_Type=Canonical} - \p{U7.0.0:Full_Composition_Exclusion}] - [\p{U6.3.0:Decomposition_Type=Canonical} - \p{U6.3.0:Full_Composition_Exclusion}] ] @@ -488,7 +488,7 @@ In $Unicode_7_Decompositions, toNFD * \P{U6.3.0:GC=Cn} ≠ toNFD $Unicode_7_Decompositions = [\U0001134B-\U0001134C \U000114BB-\U000114BC \U000114BE \U000115BA-\U000115BB] $Unicode_7_Decompositions ⊆ [\p{Name=/^(GRANTHA|TIRHUTA|SIDDHAM) VOWEL SIGN /}] -Let $Unicode_6_1_Decompositions = [ +Let $Unicode_6_1_Decompositions := [ [\p{U6.1.0:Decomposition_Type=Canonical} - \p{U6.1.0:Full_Composition_Exclusion}] - [\p{U6.0.0:Decomposition_Type=Canonical} - \p{U6.0.0:Full_Composition_Exclusion}] ] @@ -512,7 +512,7 @@ In $codepoints, Decomposition_Mapping * Decomposition_Mapping * Decomposition_Ma # the pair cannot itself have a canonical mapping. 
[ \p{Decomposition_Type=Canonical} & \p{Decomposition_Mapping=/.../} ] = [] [ \p{Decomposition_Type=Canonical} & \p{Decomposition_Mapping=} ] = [] -Let $expandingCanonicalDecompositions = [ \p{Decomposition_Type=Canonical} & \p{Decomposition_Mapping=/../} ] +Let $expandingCanonicalDecompositions := [ \p{Decomposition_Type=Canonical} & \p{Decomposition_Mapping=/../} ] In $expandingCanonicalDecompositions, Decomposition_Type * (drop 1) * Decomposition_Mapping ≠ (constant Canonical) # Not a stability policy, but it happens to be the case that the second # character does not have a decomposition mapping at all: @@ -537,17 +537,17 @@ In \P{U-1:GC=Cn}, dm=U-1:dm # pair of characters are limited such that the first of the pair in the mapping # must have ccc=0, except for the Decomposition_Mapping of the following four # characters: U+0344, U+0F73, U+0F75, U+0F81. -Let $canonicallyExpandingNonstarters = [\u0344 \u0F73 \u0F75 \u0F81] +Let $canonicallyExpandingNonstarters := [\u0344 \u0F73 \u0F75 \u0F81] In [$expandingCanonicalDecompositions - $canonicallyExpandingNonstarters], ccc * (take 1) * Decomposition_Mapping = (constant Not_Reordered) # U6.0: Construction of Full_Composition_Exclusion # Primary Composites don't include singletons, ccc!=0, or sequences starting with ccc!=0 -Let $combiningExclusions = [\p{dt=canonical}-\P{nfcqc=N}-\P{nfdqc=N}] +Let $combiningExclusions := [\p{dt=canonical}-\P{nfcqc=N}-\P{nfdqc=N}] -Let $singletons = \p{toNFD=/^.$/} -Let $nonstarter = \P{ccc=0} -Let $firstNonStarter = \p{toNFD=/^$nonstarter/} +Let $singletons := \p{toNFD=/^.$/} +Let $nonstarter := \P{ccc=0} +Let $firstNonStarter := \p{toNFD=/^$nonstarter/} $combiningExclusions ⊇ [$singletons & \p{dt=canonical}] $combiningExclusions ⊇ [$nonstarter & \p{dt=canonical}] @@ -583,13 +583,13 @@ $combiningExclusions ⊇ [$firstNonStarter & \p{dt=canonical}] # Decimals are 0-9 -Let $decimalValue = [\p{Numeric_Value=/^[0-9]+(.0)?$/}] +Let $decimalValue := [\p{Numeric_Value=/^[0-9]+(.0)?$/}] 
$decimalValue ⊇ \p{General_Category=Decimal_Number} # All and only those items with numeric types have numeric values -Let $anyNumericValue = \p{Numeric_Value=/-?[0-9]+(.[0-9]+)?/} +Let $anyNumericValue := \p{Numeric_Value=/-?[0-9]+(.[0-9]+)?/} [\p{Numeric_Type=Decimal} \p{Numeric_Type=Digit} \p{Numeric_Type=Numeric}] = $anyNumericValue ########################## @@ -598,7 +598,7 @@ Let $anyNumericValue = \p{Numeric_Value=/-?[0-9]+(.[0-9]+)?/} # Musical symbol combining marks, other oddities -Let $AlphaExclusions = [[\uAA7D \u0F3E\u0F3F\u1063\u1064\u1069-\u106D\u1087-\u108C\u108F\u109A\u109B\u1CE1\u1CF7\uAA7B\uABEC\U0001D165\U0001D166\U0001D16D-\U0001D172][[:gc=mc:]&[:ccc=9:][\u302E\u302F]]] +Let $AlphaExclusions := [[\uAA7D \u0F3E\u0F3F\u1063\u1064\u1069-\u106D\u1087-\u108C\u108F\u109A\u109B\u1CE1\u1CF7\uAA7B\uABEC\U0001D165\U0001D166\U0001D16D-\U0001D172][[:gc=mc:]&[:ccc=9:][\u302E\u302F]]] # 6.1.0 Added HANGUL SINGLE DOT TONE MARK..HANGUL DOUBLE DOT TONE MARK # 7.0 Added AA7D # 10.0 Added 1CF7 (similar to 1CE1) @@ -632,10 +632,10 @@ Show [\u20b9] # The check below enforces that rule, but allows for possible future # exceptions. Should such exceptions arise, they can be added to the definition of # $nonAlphabeticBindus to avoid a failure on this test. -Let $nonAlphabeticBindus = [] +Let $nonAlphabeticBindus := [] [\p{InSc=Bindu} - \p{Alphabetic}] = $nonAlphabeticBindus -Let $nonAlphabeticDependentVowels = [ +Let $nonAlphabeticDependentVowels := [ \N{ORIYA SIGN OVERLINE} \N{THAI CHARACTER MAITAIKHU} \N{LIMBU SIGN KEMPHRENG} @@ -649,16 +649,16 @@ Let $nonAlphabeticDependentVowels = [ \p{InSC=Nukta} ⊆ \p{Diacritic} [\p{InSC=Virama}\p{InSC=Pure_Killer}\p{InSC=Reordering_Killer}] ⊆ \p{Diacritic} \p{InSC=Invisible_Stacker} ⊆ \p{Diacritic} -Let $nonAlphabeticAvagrahas = [\N{TIBETAN MARK PALUTA}] # A punctuation mark. +Let $nonAlphabeticAvagrahas := [\N{TIBETAN MARK PALUTA}] # A punctuation mark. 
[\p{InSC=Avagraha} - $nonAlphabeticAvagrahas] ⊆ \p{Alphabetic} # Name-based checks. -Let $nonLowercaseSmallLetters = [ +Let $nonLowercaseSmallLetters := [ \p{name=/^LIMBU SMALL LETTER/} \N{TURNED GREEK SMALL LETTER IOTA} \p{name=/^(SQUARED|PARENTHESIZED|TAG) LATIN SMALL LETTER/} ] -Let $nonLowercaseSmallModifierLetters = [ \p{gc=Lm} & \p{name=/^ARABIC SMALL/} ] +Let $nonLowercaseSmallModifierLetters := [ \p{gc=Lm} & \p{name=/^ARABIC SMALL/} ] [ \p{name=/\bSMALL LETTER\b/}-\p{gc=Mn}-\p{gc=Lt} - $nonLowercaseSmallLetters ] ⊆ \p{Lowercase} [ [\p{gc=Lm} & \p{name=/SMALL/}] - $nonLowercaseSmallModifierLetters ] ⊆ \p{Lowercase} @@ -694,7 +694,7 @@ In \P{Other_Joining_Type=Deduce_From_General_Category}, Joining_Type = Other_Joi # LineBreak property ########################## -Let $IDInclusions = [ +Let $IDInclusions := [ [:block=/Ideographs/:] [ # Some ranges default to lb=ID even outside of any blocks: [\U00020000-\U0003FFFF] # Planes 2 and 3, lb=ID since 5.2, 115-C27. @@ -712,7 +712,7 @@ Let $IDInclusions = [ - [\u20C0-\u20CF] # Unassigned currency symbols are lb=PR since 6.3, 133-C26. 
] -Let $BrahmicLineBreaking = [ +Let $BrahmicLineBreaking := [ \p{sc=Balinese} \p{sc=Batak} \p{sc=Brahmi} @@ -727,9 +727,9 @@ Let $BrahmicLineBreaking = [ \p{sc=Tulu_Tigalari} \p{sc=Gurung_Khema} ] -Let $VFScripts = [\p{sc=Batak}] +Let $VFScripts := [\p{sc=Batak}] -Let $OPInclusions = [\u00A1\u00BF\u2E18\U00013258-\U0001325A\U00013286\U00013288\U00013379\U0001342F\U00013437\U0001343C\U0001343E\U000145CE\U0001E95E-\U0001E95F] +Let $OPInclusions := [\u00A1\u00BF\u2E18\U00013258-\U0001325A\U00013286\U00013288\U00013379\U0001342F\U00013437\U0001343C\U0001343E\U000145CE\U0001E95E-\U0001E95F] # 7.0 Removed hack - [\u2308\u230A] # 9.0 Added Adlam initial punctuation 1E95E..1E95F # 12.0 Added Egyptian control begin segment 13437 (gc=Cf, lb=OP) @@ -759,8 +759,8 @@ Let $OPInclusions = [\u00A1\u00BF\u2E18\U00013258-\U0001325A\U00013286\U00013288 # 15.1.0: and those from scripts that use the Brahmic style of context analysis, # 4.0.1: plus these characters: 066B ARABIC DECIMAL SEPARATOR, 066C ARABIC THOUSANDS SEPARATOR. # 15.1.0: Action item UTC-176-A81: change [[:PCM:]-\u070F] lb=AL->NU -Let $NUInclusions = [\u066B\u066C] -Let $NUFormats = [[:PCM:]-[\u070F]] +Let $NUInclusions := [\u066B\u066C] +Let $NUFormats := [[:PCM:]-[\u070F]] \p{LB=NU} = [\p{GC=Nd} $NUInclusions $NUFormats - \p{EA=F} - $BrahmicLineBreaking] # Digits are lb=AS in scripts with brahmic line breaking. @@ -770,10 +770,10 @@ Let $NUFormats = [[:PCM:]-[\u070F]] # of the rule (AK | ◌ | AS) × (AK | ◌ | AS) VF. There are no Batak digits. 
[\p{GC=Nd} & $VFScripts] = [] -Let $PRInclusions = [\u002b\u005c\u00b1\u2116\u2212\u2213\u20C0-\u20CF] +Let $PRInclusions := [\u002b\u005c\u00b1\u2116\u2212\u2213\u20C0-\u20CF] \p{LB=PR} = [\p{GC=Sc} $PRInclusions - \p{LB=PO}] -Let $QUInclusions = [\u275F-\u2760 \U0001F676-\U0001F678 \u0022 \u0027 \u275B-\u275E \u2E00-\u2E01 \u2E06-\u2E08 \u2E0B] +Let $QUInclusions := [\u275F-\u2760 \U0001F676-\U0001F678 \u0022 \u0027 \u275B-\u275E \u2E00-\u2E01 \u2E06-\u2E08 \u2E0B] # 7.0 added 275F..2760 1F676..1F678 \p{LB=QU} = [\p{GC=Pf} \p{GC=Pi} $QUInclusions] \p{LB=SG} = \p{GC=Cs} @@ -817,7 +817,7 @@ Let $QUInclusions = [\u275F-\u2760 \U0001F676-\U0001F678 \u0022 \u0027 \u275B-\u # covered by adding them to the exception set $SAScriptExceptions for the test. # SA are limited to certain scripts: -Let $SAScripts = [ +Let $SAScripts := [ \p{script=ahom} \p{script=thai} \p{script=lao} @@ -831,7 +831,7 @@ Let $SAScripts = [ $SAScripts ⊇ \p{LineBreak=SA} # And in $SA scripts, they are all the alphabetic spacing characters, plus some odd Cf & Mn, plus the NEW TAI LUE THAM DIGIT ONE -Let $SAScriptExceptions = [\x{1173A}\x{1173B}\x{1173F} \u19DA\u109E\u109F\u19DE\u19DF\u1AA0-\u1AA6\u1AA8-\u1AAD\uAA77-\uAA79\uAADE-\uAADF] +Let $SAScriptExceptions := [\x{1173A}\x{1173B}\x{1173F} \u19DA\u109E\u109F\u19DE\u19DF\u1AA0-\u1AA6\u1AA8-\u1AAD\uAA77-\uAA79\uAADE-\uAADF] # 7.0 Added AA7D # 12.0 Removed Myanmar spacing marks which were assigned Other_Alphabetic: 1063..1064, 1069..106D, 1087..108C, 108F, 109A..109B, AA7B, AA7D @@ -849,10 +849,10 @@ $SAScriptExceptions ∥ [\p{Alphabetic} \p{gc=cf} \p{gc=Mn}] # UAX 31 Table 3: Candidate Characters for Inclusion in Identifiers # Warning: the uax31 tables don't have machine-readable tables, so must be updated each release. 
-Let $uax31table3 = [\u0027\u002D\u002E\u003A\u00B7\u058A\u05F3\u05F4\u0F0B\u200C\u200D\u2010\u2019\u2027\u30A0\u30FB] +Let $uax31table3 := [\u0027\u002D\u002E\u003A\u00B7\u058A\u05F3\u05F4\u0F0B\u200C\u200D\u2010\u2019\u2027\u30A0\u30FB] -Let $WBRemovals = [\u0387\u0604\u2018\u2024\u202F\u2E2F\uFE13\uFE52\uFE55\uFF07\uFF0E\uFF1A\u200c\u200d'.\:\u00AD\u00B7\u05F3\u05F4\u0600-\u0603\u06DD\u070F\u17B4\u17B5\u200E\u200F\u2019\u2027\u202A-\u202E\u2060-\u2064\u206A-\u206F\uFEFF\uFFF9-\uFFFB\u02C2-\u02C5\u02D2-\u02D6\u02DE-\u02DF\u02ED\u02EF-\u02FF\uA720-\uA721\uA789-\uA78A\uAB5B\u055B\u055C\u055E\U000110BD\U0001D173-\U0001D17A\U000E0001\U000E0020-\U000E007F\p{gc=Cf}\p{Block=Enclosed Alphanumerics}[\u02D7\u0605\u061C\u180E\u2066-\u2069\U0001BCA0-\U0001BCA3\U0001F130-\U0001F149\U0001F150-\U0001F169\U0001F170-\U0001F189\U0001F3FB-\U0001F3FF]] -Let $WBRemovals13 = [\u02E5-\u02EB\u055A\u058A\uA708-\uA716\u055F] +Let $WBRemovals := [\u0387\u0604\u2018\u2024\u202F\u2E2F\uFE13\uFE52\uFE55\uFF07\uFF0E\uFF1A\u200c\u200d'.\:\u00AD\u00B7\u05F3\u05F4\u0600-\u0603\u06DD\u070F\u17B4\u17B5\u200E\u200F\u2019\u2027\u202A-\u202E\u2060-\u2064\u206A-\u206F\uFEFF\uFFF9-\uFFFB\u02C2-\u02C5\u02D2-\u02D6\u02DE-\u02DF\u02ED\u02EF-\u02FF\uA720-\uA721\uA789-\uA78A\uAB5B\u055B\u055C\u055E\U000110BD\U0001D173-\U0001D17A\U000E0001\U000E0020-\U000E007F\p{gc=Cf}\p{Block=Enclosed Alphanumerics}[\u02D7\u0605\u061C\u180E\u2066-\u2069\U0001BCA0-\U0001BCA3\U0001F130-\U0001F149\U0001F150-\U0001F169\U0001F170-\U0001F189\U0001F3FB-\U0001F3FF]] +Let $WBRemovals13 := [\u02E5-\u02EB\u055A\u058A\uA708-\uA716\u055F] # 9.0 corrected \p{gc=Cf} and added 202F # 10.0 added 34 characters which were absorbed into WB=LE (see http://www.unicode.org/reports/tr29/tr29-30d2.html#ALetter) # 11.0 added 5 skin tone modifiers which were absorbed into WB=Extend @@ -860,7 +860,7 @@ Let $WBRemovals13 = [\u02E5-\u02EB\u055A\u058A\uA708-\uA716\u055F] # 13.0 added 24 characters (mostly tone modifiers) which were absorbed into WB=LE 
(see http://www.unicode.org/reports/tr29/tr29-36.html#ALetter) # 13.0 added 055F which was absorbed into WB=ML (see http://www.unicode.org/reports/tr29/tr29-36.html#MidLetter) -Let $Uax31Removals = [\-\u058A\u0F0B\u2010\u30A0\u30FB\u2E2F\u17B4-\u17B5] +Let $Uax31Removals := [\-\u058A\u0F0B\u2010\u30A0\u30FB\u2E2F\u17B4-\u17B5] # 6.1.0 adjust SAMVAT, KHMER VOWEL INHERENT* [\p{Alpha}\p{WB=Extend}\p{WB=FO}\p{WB=LE}\p{WB=ML}\p{WB=MB}\p{WB=EX}-$WBRemovals-$WBRemovals13] = [$gcAllLetters $gcAllMarks \p{gc=Nl}\p{gc=Pc}-$Uax31Removals] @@ -886,8 +886,8 @@ Let $Uax31Removals = [\-\u058A\u0F0B\u2010\u30A0\u30FB\u2E2F\u17B4-\u17B5] \p{ID_Continue} = [\p{Other_ID_Continue} \p{ID_Start} \p{GC=Mn} \p{GC=Mc} \p{GC=Nd} \p{GC=Pc} - \p{Pattern_Syntax} - \p{Pattern_White_Space}] # See derivation of Default_Ignorable_Code_Point in DerivedCoreProperties.txt -Let $Annotations = [\uFFF9-\uFFFB] -Let $EgyptianControls = [\U00013430-\U00013440] +Let $Annotations := [\uFFF9-\uFFFB] +Let $EgyptianControls := [\U00013430-\U00013440] \p{Default_Ignorable_Code_Point} = [\p{Other_Default_Ignorable_Code_Point} \p{GC=Cf} \p{Variation_Selector} - [\p{White_Space} $Annotations $EgyptianControls \p{PCM}]] \p{Grapheme_Extend} = [\p{Other_Grapheme_Extend} \p{GC=Me} \p{GC=Mn}] @@ -926,9 +926,9 @@ Let $EgyptianControls = [\U00013430-\U00013440] # either adjust the set of scripts, or # - change ToolUnicodePropertySource.java to remove the offending characters from the gcbSpacingMarkSet # - add them to the exceptions list in UAX #29 -Let $PostBaseSpacingMarks_All = [[:sc=Myanmar:][:sc=Tai_Tham:][:sc=Ahom:] & [:gc=Mc:] & [:InPC=Right:]] -Let $PostBaseSpacingMarks_Tweak = [\u103B \u1056 \u1057 \u1A57 \u1A6D] -Let $PostBaseSpacingMarks_Missed = [] +Let $PostBaseSpacingMarks_All := [[:sc=Myanmar:][:sc=Tai_Tham:][:sc=Ahom:] & [:gc=Mc:] & [:InPC=Right:]] +Let $PostBaseSpacingMarks_Tweak := [\u103B \u1056 \u1057 \u1A57 \u1A6D] +Let $PostBaseSpacingMarks_Missed := [] [$PostBaseSpacingMarks_All - 
$PostBaseSpacingMarks_Tweak - $PostBaseSpacingMarks_Missed] ⊂ [:GCB=XX:] # Check the consistency of grapheme cluster segmentation (both legacy and @@ -944,9 +944,9 @@ Let $PostBaseSpacingMarks_Missed = [] # But not before Unicode Version 16.0, even though we were saying so since # Unicode Version 4.0 (https://www.unicode.org/reports/tr29/tr29-4.html#Implementation_Notes), # oops (see L2/24-009). -Let $TwoForgottenMusicalSymbols = \p{Name=/^MUSICAL SYMBOL COMBINING (SPRECHGESANG STEM|AUGMENTATION DOT)$/} -Let $FourteenSpacingViramas = [\p{U15.1.0:ccc=9}&\p{U15.1.0:gc=Mc}] -Let $TwoVietnameseReadingMarks = [\p{U15.1.0:ccc=6}] +Let $TwoForgottenMusicalSymbols := \p{Name=/^MUSICAL SYMBOL COMBINING (SPRECHGESANG STEM|AUGMENTATION DOT)$/} +Let $FourteenSpacingViramas := [\p{U15.1.0:ccc=9}&\p{U15.1.0:gc=Mc}] +Let $TwoVietnameseReadingMarks := [\p{U15.1.0:ccc=6}] [\P{U4.0.0:ccc=0} - \p{U4.0.0:Grapheme_Extend}] = [$TwoForgottenMusicalSymbols \p{Name=/^MUSICAL SYMBOL COMBINING FLAG-[3-5]$/}] [\P{U4.1.0:ccc=0} - \p{U4.1.0:GCB=Extend}] = $TwoForgottenMusicalSymbols @@ -993,7 +993,7 @@ Let $TwoVietnameseReadingMarks = [\p{U15.1.0:ccc=6}] \p{Emoji_Modifier} ⊂ \p{Emoji_Presentation} \p{Emoji_Presentation} ⊂ \p{Emoji} -Let $HairComponents = [\U0001F9B0-\U0001F9B3] +Let $HairComponents := [\U0001F9B0-\U0001F9B3] [\p{Extended_Pictographic} & \p{Emoji_Component}] = $HairComponents \p{Extended_Pictographic} ⊃ [\p{Emoji} - \p{Emoji_Component}] \p{Extended_Pictographic} ⊃ [\p{Emoji_Presentation} - \p{Emoji_Component}] @@ -1031,17 +1031,17 @@ Let $HairComponents = [\U0001F9B0-\U0001F9B3] # constants -Let $SP = [\u0020] # [\N{space}] -Let $TAB = [\u0009] # [\N{CHARACTER TABULATION}] -Let $LF = [\u000A] # \N{linefeed} -Let $VTAB = [\u000B] # [\N{LINE TABULATION}] -Let $FF = [\u000C] # [\N{formfeed}] -Let $CR = [\u000D] # \N{carriage return} -Let $NEL = [\u0085] # \N{next line} +Let $SP := [\u0020] # [\N{space}] +Let $TAB := [\u0009] # [\N{CHARACTER TABULATION}] +Let $LF := [\u000A] # 
\N{linefeed} +Let $VTAB := [\u000B] # [\N{LINE TABULATION}] +Let $FF := [\u000C] # [\N{formfeed}] +Let $CR := [\u000D] # \N{carriage return} +Let $NEL := [\u0085] # \N{next line} #Let $ZWNJ = [\u200C] # [\N{ZERO WIDTH NON-JOINER}] #Let $ZWJ = [\u200D] # [\N{ZERO WIDTH JOINER}] -Let $CircledAsciiLetters = [\u24B6-\u24E9] +Let $CircledAsciiLetters := [\u24B6-\u24E9] # Unassigned, Control, Format, Private_Use, Surrogate, # Uppercase_Letter, Lowercase_Letter, Titlecase_Letter, Modifier_Letter, Other_Letter, @@ -1053,19 +1053,19 @@ Let $CircledAsciiLetters = [\u24B6-\u24E9] # UTS Rules -Let $alpha = [\p{Alphabetic} $CircledAsciiLetters] -Let $lower = \p{Lowercase} -Let $upper = [\p{Uppercase}] -Let $punct = [$gcAllPunctuation $gcAllSymbols - $alpha] -Let $digit = \p{gc=Decimal_Number} -Let $xdigit = [\p{gc=Decimal_Number} \p{Hex_Digit}] # in both! -Let $alnum = [$alpha $digit] -Let $space = \p{Whitespace} -Let $blank = [\p{Whitespace} - [$LF $VTAB $FF $CR $NEL \p{gc=Line_Separator} \p{gc=Paragraph_Separator}]] -Let $cntrl = \p{gc=Control} -Let $graph = [^$space \p{gc=Control} \p{gc=Surrogate} \p{gc=Unassigned}] -Let $print = [$graph $blank - $cntrl] -Let $word = [$alpha $gcAllMarks $digit \p{gc=Connector_Punctuation}] +Let $alpha := [\p{Alphabetic} $CircledAsciiLetters] +Let $lower := \p{Lowercase} +Let $upper := [\p{Uppercase}] +Let $punct := [$gcAllPunctuation $gcAllSymbols - $alpha] +Let $digit := \p{gc=Decimal_Number} +Let $xdigit := [\p{gc=Decimal_Number} \p{Hex_Digit}] # in both! 
+Let $alnum := [$alpha $digit] +Let $space := \p{Whitespace} +Let $blank := [\p{Whitespace} - [$LF $VTAB $FF $CR $NEL \p{gc=Line_Separator} \p{gc=Paragraph_Separator}]] +Let $cntrl := \p{gc=Control} +Let $graph := [^$space \p{gc=Control} \p{gc=Surrogate} \p{gc=Unassigned}] +Let $print := [$graph $blank - $cntrl] +Let $word := [$alpha $gcAllMarks $digit \p{gc=Connector_Punctuation}] # =========================== @@ -1104,7 +1104,7 @@ $blank ⊇ [$SP $TAB] # Extra POSIX 'POSIX locale' constraints -Let $C0Controls = [\u0000-\u001F] +Let $C0Controls := [\u0000-\u001F] $cntrl ⊇ $C0Controls @@ -1136,10 +1136,10 @@ $punct ⊇ [[\u0021-\u007E] - [0-9 A-Z a-z]] # The Khitan Small Script filler is a Nonspacing Mark. # The other characters are numerals (the Hangzhou ten through thirty are compatibility decomposable, # but not the one through nine) and have Script=Han. -Let $NonOtherLetterIdeographs = [\N{KHITAN SMALL SCRIPT FILLER} 〇 〡-〩 〸-〺] +Let $NonOtherLetterIdeographs := [\N{KHITAN SMALL SCRIPT FILLER} 〇 〡-〩 〸-〺] $NonOtherLetterIdeographs = [\p{Ideographic} - \p{gc=Lo}] # Ideographic closing mark, gc=Lo. -Let $CommonIdeographs = [〆] +Let $CommonIdeographs := [〆] $CommonIdeographs = [\p{Ideographic} & \p{sc=Common}] \p{Ideographic} = [ @@ -1156,7 +1156,7 @@ $CommonIdeographs = [\p{Ideographic} & \p{sc=Common}] [ [\p{Ideographic}&\p{sc=Han}] - \p{nfkcqc=n} - $NonOtherLetterIdeographs ] = \p{Unified_Ideograph} -Let $unihanScope = [\p{Block=/^CJK.(Unified|Compatibility).Ideographs/} - \p{gc=Cn}] +Let $unihanScope := [\p{Block=/^CJK.(Unified|Compatibility).Ideographs/} - \p{gc=Cn}] \p{Unified_Ideograph} ⊂ $unihanScope $unihanScope = [\p{gc=Lo} & \p{sc=Hani}] $unihanScope = \P{kRSUnicode=@none@} @@ -1178,17 +1178,17 @@ $unihanScope = [ # TODO(eggrobin): Should those two have a kMandarin, or this not actually an invariant? # See https://www.unicode.org/review/pri483/feedback.html#ID20240118004124. # Change to \P{kHanyuPinyin=@none@} ⊆ \P{kMandarin=@none@} once this gets fixed. 
-Let $ideographsMissingkMandarin = [\x{228F5}\x{2574C}] +Let $ideographsMissingkMandarin := [\x{228F5}\x{2574C}] [\P{kHanyuPinyin=@none@} - \P{kMandarin=@none@}] = $ideographsMissingkMandarin -Let $cjkStrokes = \p{Name=/^CJK STROKE /} -Let $kangxiRadicals = \p{Name=/^KANGXI RADICAL /} -Let $cjkRadicals = \p{Name=/^CJK RADICAL /} -Let $strokesAndRadicals = [ $cjkStrokes $kangxiRadicals $cjkRadicals ] -Let $nonIdeographicRadicals = \N{CJK RADICAL REPEAT} +Let $cjkStrokes := \p{Name=/^CJK STROKE /} +Let $kangxiRadicals := \p{Name=/^KANGXI RADICAL /} +Let $cjkRadicals := \p{Name=/^CJK RADICAL /} +Let $strokesAndRadicals := [ $cjkStrokes $kangxiRadicals $cjkRadicals ] +Let $nonIdeographicRadicals := \N{CJK RADICAL REPEAT} # The following set may expand over time, if strokes are added. # It can also shrink, if single-stroke ideographs are encoded. -Let $nonIdeographicStrokes = \p{Name=/^CJK STROKE (T|WG|XG|BXG|SW|HZZ|HP|HZWG|SZWG|HZT|HZZP|HPWG|HZW|HZZZ|PG|Q|HXG|SZP)$/} +Let $nonIdeographicStrokes := \p{Name=/^CJK STROKE (T|WG|XG|BXG|SW|HZZ|HP|HZWG|SZWG|HZT|HZZP|HPWG|HZW|HZZZ|PG|Q|HXG|SZP)$/} # See https://www.unicode.org/review/pri502/feedback.html#ID20240523095709. $cjkStrokes ⊆ \p{scx=Hani} @@ -1209,31 +1209,31 @@ In $kangxiRadicals, Equivalent_Unified_Ideograph ∈ \p{kRSUnicode=/\.0/} # non-Chinese simplified radical (kRSUnicode=n''.0) otherwise. # However, two of the simplified radicals are unifiable with their non-simplified counterparts, # and are therefore equivalent to ideographs with kRSUnicode=n.0. 
-Let $radicalsWithUnifiableSimplifications = [角辶] +Let $radicalsWithUnifiableSimplifications := [角辶] $radicalsWithUnifiableSimplifications ⊆ \p{kRSUnicode=/^[0-9]+\.0$/} [$radicalsWithUnifiableSimplifications & \p{kRSUnicode=/^[0-9]+'\.0$/}] = [] -Let $chineseSimplifiedRadicals = \p{Name=/CJK RADICAL (C-)?SIMPLIFIED/} -Let $japaneseSimplifiedRadicals = \p{Name=/CJK RADICAL J-SIMPLIFIED/} +Let $chineseSimplifiedRadicals := \p{Name=/CJK RADICAL (C-)?SIMPLIFIED/} +Let $japaneseSimplifiedRadicals := \p{Name=/CJK RADICAL J-SIMPLIFIED/} In $chineseSimplifiedRadicals, Equivalent_Unified_Ideograph ∈ [\p{kRSUnicode=/^[0-9]+'\.0$/} $radicalsWithUnifiableSimplifications] In $japaneseSimplifiedRadicals, Equivalent_Unified_Ideograph ∈ \p{kRSUnicode=/^[0-9]+''\.0$/} # Tangut invariants -Let $tangutSourcesScope = [\p{Block=/^Tangut(.Supplement)?$/} - \p{gc=Cn}] +Let $tangutSourcesScope := [\p{Block=/^Tangut(.Supplement)?$/} - \p{gc=Cn}] $tangutSourcesScope = [ [\p{gc=Lo} & \p{sc=Tangut}] - \p{name=/^TANGUT COMPONENT-/} ] $tangutSourcesScope = \P{kTGT_MergedSrc=@none@} $tangutSourcesScope = \P{kRSTUnicode=@none@} # Nüshu invariants -Let $nüshuSourcesScope = [\p{Block=Nushu} - \p{gc=Cn}] +Let $nüshuSourcesScope := [\p{Block=Nushu} - \p{gc=Cn}] $nüshuSourcesScope = [\p{gc=Lo} & \p{sc=Nushu}] $nüshuSourcesScope = \P{kSrc_NushuDuben=@none@} $nüshuSourcesScope = \P{kReading=@none@} # Egyptian hieroglyph invariants -Let $unikemetScope = [\p{Block=/^Egyptian.Hieroglyphs/} - \p{gc=Cn}] +Let $unikemetScope := [\p{Block=/^Egyptian.Hieroglyphs/} - \p{gc=Cn}] $unikemetScope = [ [\p{gc=Lo} & \p{sc=Egyp}] - \p{Name=/^EGYPTIAN HIEROGLYPH (FULL |HALF |TALL |WIDE )?(BLANK|LOST SIGN)$/} ] $unikemetScope = \P{kEH_Cat=@none@} $unikemetScope = \P{kEH_UniK=@none@}