From 98d36cbd3b68efcea56f1003ebafd83c71a1d182 Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Wed, 8 Jan 2025 16:45:51 -0600 Subject: [PATCH] CLDR-16836 kbd: ebnf: update BNF per review - remove illegal ctrls - reorder members - cleanup --- docs/ldml/tr35-keyboards.md | 71 ++++++++++----------- keyboards/abnf/transform-from-required.abnf | 35 +++++----- keyboards/abnf/transform-to-required.abnf | 49 ++++++++------ 3 files changed, 81 insertions(+), 74 deletions(-) diff --git a/docs/ldml/tr35-keyboards.md b/docs/ldml/tr35-keyboards.md index 5737bbe7712..1c7eb52eef1 100644 --- a/docs/ldml/tr35-keyboards.md +++ b/docs/ldml/tr35-keyboards.md @@ -2438,7 +2438,7 @@ quark ::= non-group | group non-group ::= simple-matcher - | codepointseq + | escaped-codepoints | variable variable ::= string-variable | set-variable @@ -2446,15 +2446,16 @@ string-variable ::= '${' var-id '}' set-variable ::= '$[' var-id ']' +var-id ::= IDCHAR+ group ::= capturing-group | non-capturing-group quantifier ::= bounded-quantifier | '?' -codepointseq - ::= '\' 'u' '{' cphexseq '}' -codepoint - ::= '\' 'u' '{' cphexseq '}' +escaped-codepoints + ::= '\' 'u' '{' codepoints-hex '}' +escaped-codepoint + ::= '\' 'u' '{' codepoint-hex '}' bounded-quantifier ::= '{' DIGIT ',' DIGIT '}' non-capturing-group @@ -2464,8 +2465,10 @@ capturing-group catoms ::= catom+ catom ::= cquark quantifier? cquark ::= non-group -cphexseq ::= cphex ( ' ' cphex )* -cphex ::= LHEXDIG ( LHEXDIG ( LHEXDIG ( LHEXDIG ( LHEXDIG LHEXDIG? )? )? )? )? +codepoints-hex + ::= codepoint-hex ( ' ' codepoint-hex )* +codepoint-hex + ::= LHEXDIG ( LHEXDIG ( LHEXDIG ( LHEXDIG ( LHEXDIG LHEXDIG? )? )? )? )? simple-matcher ::= text-char | class @@ -2478,7 +2481,6 @@ match-named-marker ::= '\m{' marker-id '}' marker-id ::= NMTOKEN -var-id ::= IDCHAR+ class ::= fixed-class | set-class fixed-class @@ -2509,7 +2511,7 @@ set-member char-range ::= range-edge '-' range-edge range-edge - ::= codepoint + ::= escaped-codepoint | range-char set-negator ::= '^'? @@ -2528,8 +2530,7 @@ range-char | '{' | '}' content-char - ::= ASCII-CTRLS - | ASCII-PUNCT + ::= ASCII-PUNCT | ALPHA | DIGIT | NON-ASCII @@ -2542,8 +2543,6 @@ ws ::= [ #x3000] IDCHAR ::= ALPHA | DIGIT | '_' -ASCII-CTRLS - ::= [#x1-#x8#xB-#xC#xE-#x1F] ASCII-PUNCT ::= [!-#%-',/;->_`#x7E-#x7F] NON-ASCII @@ -2594,29 +2593,13 @@ atoms ::= atom* atom ::= replacement-char | escaped-char | group-reference - | codepointseq + | escaped-codepoints | named-marker | string-variable | mapped-set -string-variable - ::= '${' var-id '}' -group-reference - ::= '$' DIGIT -mapped-set - ::= '$[1:' var-id ']' -codepointseq - ::= '\' 'u' '{' cphexseq '}' -cphexseq ::= cphex ( ' ' cphex )* -cphex ::= LHEXDIG ( LHEXDIG ( LHEXDIG ( LHEXDIG ( LHEXDIG LHEXDIG? )? )? )? )? -named-marker - ::= '\m{' marker-id '}' -marker-id - ::= NMTOKEN -var-id ::= IDCHAR+ replacement-char ::= content-char | ws - | escaped-char | '-' | ':' | '(' @@ -2631,15 +2614,31 @@ replacement-char | '{' | '}' | '|' +escaped-char + ::= '\' ( '\' | '$' ) + | '$$' +group-reference + ::= '$' DIGIT +escaped-codepoints + ::= '\' 'u' '{' codepoints-hex '}' +codepoints-hex + ::= codepoint-hex ( ' ' codepoint-hex )* +codepoint-hex + ::= LHEXDIG ( LHEXDIG ( LHEXDIG ( LHEXDIG ( LHEXDIG LHEXDIG? )? )? )? )? +named-marker + ::= '\m{' marker-id '}' +marker-id + ::= NMTOKEN +string-variable + ::= '${' var-id '}' +var-id ::= IDCHAR+ +mapped-set + ::= '$[1:' var-id ']' content-char - ::= ASCII-CTRLS - | ASCII-PUNCT + ::= ASCII-PUNCT | ALPHA | DIGIT | NON-ASCII -escaped-char - ::= '\' ( '\' | '$' ) - | '$$' ws ::= [ #x3000] | HTAB | CR @@ -2647,8 +2646,6 @@ ws ::= [ #x3000] IDCHAR ::= ALPHA | DIGIT | '_' -ASCII-CTRLS - ::= [#x1-#x8#xB-#xC#xE-#x1F] ASCII-PUNCT ::= [!-#%-',/;->_`#x7E-#x7F] NON-ASCII diff --git a/keyboards/abnf/transform-from-required.abnf b/keyboards/abnf/transform-from-required.abnf index 5e97a6ab232..4e02db22f26 100644 --- a/keyboards/abnf/transform-from-required.abnf +++ b/keyboards/abnf/transform-from-required.abnf @@ -12,30 +12,38 @@ from-match = start-context atoms / atoms +; special marker anchoring to the start of context start-context = "^" -; an empty match is not allowed. +; sequence of items for input match. note that empty is not allowed, must be at least one atom. atoms = atom *(disjunction atom / atom) +; for use with or disjunction = "|" +; a 'quark' is the matching part of an atom, and then a quantifier atom = quark quantifier / quark +; quark can be a grouping or non grouping quark = non-group / group -non-group = simple-matcher / codepointseq / variable +non-group = simple-matcher / escaped-codepoints / variable variable = string-variable / set-variable string-variable = "${" var-id "}" + set-variable = "$[" var-id "]" +; variable ID +var-id = 1*32IDCHAR + group = capturing-group / non-capturing-group quantifier = bounded-quantifier / optional-quantifier -codepointseq = backslash "u" "{" cphexseq "}" -codepoint = backslash "u" "{" cphexseq "}" +escaped-codepoints = backslash "u" "{" codepoints-hex "}" +escaped-codepoint = backslash "u" "{" codepoint-hex "}" bounded-quantifier = "{" DIGIT "," DIGIT "}" optional-quantifier = "?" @@ -54,10 +62,10 @@ catom = cquark / cquark quantifier cquark = non-group ; multiple hex codepoints -cphexseq = cphex *(SP cphex) +codepoints-hex = codepoint-hex *(SP codepoint-hex) ; one hex codepoint (1-6 digits) -cphex = 1*6LHEXDIG +codepoint-hex = 1*6LHEXDIG simple-matcher = text-char / class / match-any-codepoint / match-marker @@ -66,11 +74,8 @@ match-any-codepoint = "." match-marker = match-any-marker / match-named-marker match-any-marker = "\m{.}" match-named-marker = "\m{" marker-id "}" - ; marker id is nmtoken, but may be UAX31 in the future. marker-id = NMTOKEN -; variable ID -var-id = 1*32IDCHAR class = fixed-class / set-class @@ -82,7 +87,7 @@ set-class = "[" set-negator set-members "]" set-members = set-member *(set-member) set-member = text-char / char-range / match-marker char-range = range-edge "-" range-edge -range-edge = codepoint / range-char +range-edge = escaped-codepoint / range-char set-negator = "^" / "" ; Restrictions on characters in various contexts @@ -92,7 +97,7 @@ text-char = content-char / ws / escaped-char / "-" / ":" ; text in a range sequence range-char = content-char / ws / escaped-char / "."/ "|" / "{" / "}" ; group for everything BUT syntax chars. -content-char = ASCII-CTRLS / ASCII-PUNCT / ALPHA / DIGIT / NON-ASCII +content-char = ASCII-PUNCT / ALPHA / DIGIT / NON-ASCII ; Character escapes escaped-char = backslash ( backslash / "{" / "|" / "}" ) @@ -101,9 +106,9 @@ backslash = %x5C ; U+005C REVERSE SOLIDUS "\" ws = SP / HTAB / CR / LF / %x3000 IDCHAR = ALPHA / DIGIT / "_" -ASCII-CTRLS = %x01-08 ; omit NULL (%x00), HTAB (%x09) and LF (%x0A) - / %x0B-0C ; omit CR (%x0D) - / %x0E-1F ; omit SP (%x20) +; ASCII-CTRLS = %x01-08 ; omit NULL (%x00), HTAB (%x09) and LF (%x0A) +; / %x0B-0C ; omit CR (%x0D) +; / %x0E-1F ; omit SP (%x20) ASCII-PUNCT = %x21-23 ; omit DOLLAR / %x25-27 ; omit () * + / %x2C ; omit . (%x2E) and - (%x2D) @@ -131,6 +136,4 @@ NAMESTARTCHAR = ":" / ALPHA / "_" / %xC0-D6 / %xD8-F6 / %xF8-2FF / %x370-37 NAMESTARTCHAR =/ %x10000-10FFFF ; SKIP-NODE-ABNF: TODO: NAMECHAR = NAMESTARTCHAR / "-" / "." / DIGIT / %xB7 / %x0300-036F / %x203F-2040 -; NAME = NAMESTARTCHAR *(NAMECHAR) NMTOKEN = 1*NAMECHAR -; NMTOKENS = NMTOKEN *(SP NMTOKEN) diff --git a/keyboards/abnf/transform-to-required.abnf b/keyboards/abnf/transform-to-required.abnf index da5c7e47a34..25b16d74dbe 100644 --- a/keyboards/abnf/transform-to-required.abnf +++ b/keyboards/abnf/transform-to-required.abnf @@ -11,51 +11,60 @@ ; Also note that a string may match this ABNF but be invalid according to the spec - which see. to-replacement = atoms +; a sequence of items for the output production atoms = *(atom) -atom = replacement-char / escaped-char / group-reference / codepointseq / named-marker / string-variable / mapped-set +; each atom can be one of several things +atom = replacement-char / escaped-char / group-reference / escaped-codepoints / named-marker / string-variable / mapped-set -string-variable = "${" var-id "}" +; normal text being output +replacement-char = content-char / ws / "-" / ":" / "(" / ")" / "." / "*" / "+" / "?" / "[" / "]" / "^" / "{" / "}" / "|" -group-reference = "$" DIGIT +; Character escapes +escaped-char = backslash ( backslash / "$" ) / "$$" -mapped-set = "$[1:" var-id "]" +; reference to a capture group +group-reference = "$" DIGIT -codepointseq = backslash "u" "{" cphexseq "}" +; hex codepoint such as \u{01234} +escaped-codepoints = backslash "u" "{" codepoints-hex "}" ; multiple hex codepoints -cphexseq = cphex *(SP cphex) +codepoints-hex = codepoint-hex *(SP codepoint-hex) ; one hex codepoint (1-6 digits) -cphex = 1*6LHEXDIG +codepoint-hex = 1*6LHEXDIG +; a specific marker ID. named-marker = "\m{" marker-id "}" ; marker id is nmtoken, but may be UAX31 in the future. marker-id = NMTOKEN + + +; substitution of a string variable +string-variable = "${" var-id "}" + ; variable ID var-id = 1*32IDCHAR -; fixed-class = backslash fixed-class-char - -; fixed-class-char = "s" / "S" / "t" / "r" / "n" / "f" / "v" / backslash / "$" / "d" / "w" / "D" / "W" / "0" +; special case for a mapped set variable +mapped-set = "$[1:" var-id "]" -; normal text -replacement-char = content-char / ws / escaped-char / "-" / ":" / "(" / ")" / "." / "*" / "+" / "?" / "[" / "]" / "^" / "{" / "}" / "|" ; group for everything BUT syntax chars. -content-char = ASCII-CTRLS / ASCII-PUNCT / ALPHA / DIGIT / NON-ASCII - -; Character escapes -escaped-char = backslash ( backslash / "$" ) / "$$" +content-char = ASCII-PUNCT / ALPHA / DIGIT / NON-ASCII +; \ backslash = %x5C ; U+005C REVERSE SOLIDUS "\" + +; whitespace ws = SP / HTAB / CR / LF / %x3000 IDCHAR = ALPHA / DIGIT / "_" ; below is same as transform-from for maintenance -ASCII-CTRLS = %x01-08 ; omit NULL (%x00), HTAB (%x09) and LF (%x0A) - / %x0B-0C ; omit CR (%x0D) - / %x0E-1F ; omit SP (%x20) +; ASCII-CTRLS = %x01-08 ; omit NULL (%x00), HTAB (%x09) and LF (%x0A) +; / %x0B-0C ; omit CR (%x0D) +; / %x0E-1F ; omit SP (%x20) ASCII-PUNCT = %x21-23 ; omit DOLLAR / %x25-27 ; omit () * + / %x2C ; omit . (%x2E) and - (%x2D) @@ -83,6 +92,4 @@ NAMESTARTCHAR = ":" / ALPHA / "_" / %xC0-D6 / %xD8-F6 / %xF8-2FF / %x370-37 NAMESTARTCHAR =/ %x10000-10FFFF ; SKIP-NODE-ABNF: TODO: NAMECHAR = NAMESTARTCHAR / "-" / "." / DIGIT / %xB7 / %x0300-036F / %x203F-2040 -; NAME = NAMESTARTCHAR *(NAMECHAR) NMTOKEN = 1*NAMECHAR -; NMTOKENS = NMTOKEN *(SP NMTOKEN)