From 98d36cbd3b68efcea56f1003ebafd83c71a1d182 Mon Sep 17 00:00:00 2001
From: "Steven R. Loomis" <srl295@gmail.com>
Date: Wed, 8 Jan 2025 16:45:51 -0600
Subject: [PATCH] CLDR-16836 kbd: ebnf: update BNF per review

- remove illegal ctrls
- reorder members
- cleanup
---
 docs/ldml/tr35-keyboards.md                 | 71 ++++++++++-----------
 keyboards/abnf/transform-from-required.abnf | 35 +++++-----
 keyboards/abnf/transform-to-required.abnf   | 49 ++++++++------
 3 files changed, 81 insertions(+), 74 deletions(-)

diff --git a/docs/ldml/tr35-keyboards.md b/docs/ldml/tr35-keyboards.md
index 5737bbe7712..1c7eb52eef1 100644
--- a/docs/ldml/tr35-keyboards.md
+++ b/docs/ldml/tr35-keyboards.md
@@ -2438,7 +2438,7 @@ quark    ::= non-group
            | group
 non-group
          ::= simple-matcher
-           | codepointseq
+           | escaped-codepoints
            | variable
 variable ::= string-variable
            | set-variable
@@ -2446,15 +2446,16 @@ string-variable
          ::= '${' var-id '}'
 set-variable
          ::= '$[' var-id ']'
+var-id   ::= IDCHAR+
 group    ::= capturing-group
            | non-capturing-group
 quantifier
          ::= bounded-quantifier
            | '?'
-codepointseq
-         ::= '\' 'u' '{' cphexseq '}'
-codepoint
-         ::= '\' 'u' '{' cphexseq '}'
+escaped-codepoints
+         ::= '\' 'u' '{' codepoints-hex '}'
+escaped-codepoint
+         ::= '\' 'u' '{' codepoint-hex '}'
 bounded-quantifier
          ::= '{' DIGIT ',' DIGIT '}'
 non-capturing-group
@@ -2464,8 +2465,10 @@ capturing-group
 catoms   ::= catom+
 catom    ::= cquark quantifier?
 cquark   ::= non-group
-cphexseq ::= cphex ( ' ' cphex )*
-cphex    ::= LHEXDIG ( LHEXDIG ( LHEXDIG ( LHEXDIG ( LHEXDIG LHEXDIG? )? )? )? )?
+codepoints-hex
+         ::= codepoint-hex ( ' ' codepoint-hex )*
+codepoint-hex
+         ::= LHEXDIG ( LHEXDIG ( LHEXDIG ( LHEXDIG ( LHEXDIG LHEXDIG? )? )? )? )?
 simple-matcher
          ::= text-char
            | class
@@ -2478,7 +2481,6 @@ match-named-marker
          ::= '\m{' marker-id '}'
 marker-id
          ::= NMTOKEN
-var-id   ::= IDCHAR+
 class    ::= fixed-class
            | set-class
 fixed-class
@@ -2509,7 +2511,7 @@ set-member
 char-range
          ::= range-edge '-' range-edge
 range-edge
-         ::= codepoint
+         ::= escaped-codepoint
            | range-char
 set-negator
          ::= '^'?
@@ -2528,8 +2530,7 @@ range-char
            | '{'
            | '}'
 content-char
-         ::= ASCII-CTRLS
-           | ASCII-PUNCT
+         ::= ASCII-PUNCT
            | ALPHA
            | DIGIT
            | NON-ASCII
@@ -2542,8 +2543,6 @@ ws       ::= [ #x3000]
 IDCHAR   ::= ALPHA
            | DIGIT
            | '_'
-ASCII-CTRLS
-         ::= [#x1-#x8#xB-#xC#xE-#x1F]
 ASCII-PUNCT
          ::= [!-#%-',/;->_`#x7E-#x7F]
 NON-ASCII
@@ -2594,29 +2593,13 @@ atoms    ::= atom*
 atom     ::= replacement-char
            | escaped-char
            | group-reference
-           | codepointseq
+           | escaped-codepoints
            | named-marker
            | string-variable
            | mapped-set
-string-variable
-         ::= '${' var-id '}'
-group-reference
-         ::= '$' DIGIT
-mapped-set
-         ::= '$[1:' var-id ']'
-codepointseq
-         ::= '\' 'u' '{' cphexseq '}'
-cphexseq ::= cphex ( ' ' cphex )*
-cphex    ::= LHEXDIG ( LHEXDIG ( LHEXDIG ( LHEXDIG ( LHEXDIG LHEXDIG? )? )? )? )?
-named-marker
-         ::= '\m{' marker-id '}'
-marker-id
-         ::= NMTOKEN
-var-id   ::= IDCHAR+
 replacement-char
          ::= content-char
            | ws
-           | escaped-char
            | '-'
            | ':'
            | '('
@@ -2631,15 +2614,31 @@ replacement-char
            | '{'
            | '}'
            | '|'
+escaped-char
+         ::= '\' ( '\' | '$' )
+           | '$$'
+group-reference
+         ::= '$' DIGIT
+escaped-codepoints
+         ::= '\' 'u' '{' codepoints-hex '}'
+codepoints-hex
+         ::= codepoint-hex ( ' ' codepoint-hex )*
+codepoint-hex
+         ::= LHEXDIG ( LHEXDIG ( LHEXDIG ( LHEXDIG ( LHEXDIG LHEXDIG? )? )? )? )?
+named-marker
+         ::= '\m{' marker-id '}'
+marker-id
+         ::= NMTOKEN
+string-variable
+         ::= '${' var-id '}'
+var-id   ::= IDCHAR+
+mapped-set
+         ::= '$[1:' var-id ']'
 content-char
-         ::= ASCII-CTRLS
-           | ASCII-PUNCT
+         ::= ASCII-PUNCT
            | ALPHA
            | DIGIT
            | NON-ASCII
-escaped-char
-         ::= '\' ( '\' | '$' )
-           | '$$'
 ws       ::= [ #x3000]
            | HTAB
            | CR
@@ -2647,8 +2646,6 @@ ws       ::= [ #x3000]
 IDCHAR   ::= ALPHA
            | DIGIT
            | '_'
-ASCII-CTRLS
-         ::= [#x1-#x8#xB-#xC#xE-#x1F]
 ASCII-PUNCT
          ::= [!-#%-',/;->_`#x7E-#x7F]
 NON-ASCII
diff --git a/keyboards/abnf/transform-from-required.abnf b/keyboards/abnf/transform-from-required.abnf
index 5e97a6ab232..4e02db22f26 100644
--- a/keyboards/abnf/transform-from-required.abnf
+++ b/keyboards/abnf/transform-from-required.abnf
@@ -12,30 +12,38 @@
 
 from-match        = start-context atoms / atoms
 
+; special marker anchoring to the start of context
 start-context  = "^"
 
-; an empty match is not allowed.
+; sequence of items for the input match. Note that an empty match is not allowed: there must be at least one atom.
 atoms             = atom *(disjunction atom / atom)
 
+; separator between alternatives (logical OR)
 disjunction = "|"
 
+; an atom is a 'quark' (the matching part), optionally followed by a quantifier
 atom = quark quantifier / quark
 
+; a quark is either a group or a non-group
 quark  = non-group / group
 
-non-group = simple-matcher / codepointseq / variable
+non-group = simple-matcher / escaped-codepoints / variable
 
 variable = string-variable / set-variable
 
 string-variable = "${" var-id "}"
+
 set-variable = "$[" var-id "]"
 
+; variable ID
+var-id = 1*32IDCHAR
+
 group = capturing-group / non-capturing-group
 
 quantifier    =  bounded-quantifier / optional-quantifier
 
-codepointseq           = backslash "u" "{" cphexseq "}"
-codepoint           = backslash "u" "{" cphexseq "}"
+escaped-codepoints           = backslash "u" "{" codepoints-hex "}"
+escaped-codepoint           = backslash "u" "{" codepoint-hex "}"
 
 bounded-quantifier = "{" DIGIT "," DIGIT "}"
 optional-quantifier =  "?"
@@ -54,10 +62,10 @@ catom = cquark / cquark quantifier
 cquark = non-group
 
 ; multiple hex codepoints
-cphexseq = cphex *(SP cphex)
+codepoints-hex = codepoint-hex *(SP codepoint-hex)
 
 ; one hex codepoint (1-6 digits)
-cphex =  1*6LHEXDIG
+codepoint-hex =  1*6LHEXDIG
 
 simple-matcher      = text-char / class / match-any-codepoint / match-marker
 
@@ -66,11 +74,8 @@ match-any-codepoint = "."
 match-marker = match-any-marker / match-named-marker
 match-any-marker = "\m{.}"
 match-named-marker = "\m{" marker-id "}"
-
 ; marker id is nmtoken, but may be UAX31 in the future.
 marker-id = NMTOKEN
-; variable ID
-var-id = 1*32IDCHAR
 
 class = fixed-class / set-class
 
@@ -82,7 +87,7 @@ set-class = "[" set-negator set-members "]"
 set-members = set-member *(set-member)
 set-member = text-char / char-range / match-marker
 char-range = range-edge "-" range-edge
-range-edge = codepoint / range-char
+range-edge = escaped-codepoint / range-char
 set-negator = "^" / ""
 
 ; Restrictions on characters in various contexts
@@ -92,7 +97,7 @@ text-char         = content-char / ws / escaped-char / "-" / ":"
 ; text in a range sequence
 range-char        = content-char / ws / escaped-char / "."/ "|" / "{" / "}"
 ; group for everything BUT syntax chars.
-content-char      = ASCII-CTRLS / ASCII-PUNCT / ALPHA / DIGIT / NON-ASCII
+content-char      = ASCII-PUNCT / ALPHA / DIGIT / NON-ASCII
 
 ; Character escapes
 escaped-char = backslash ( backslash / "{" / "|" / "}" )
@@ -101,9 +106,9 @@ backslash    = %x5C ; U+005C REVERSE SOLIDUS "\"
 ws = SP / HTAB / CR / LF / %x3000
 
 IDCHAR = ALPHA / DIGIT / "_"
-ASCII-CTRLS        = %x01-08       ; omit NULL (%x00), HTAB (%x09) and LF (%x0A)
-                  / %x0B-0C        ; omit CR (%x0D)
-                  / %x0E-1F        ; omit SP (%x20)
+; ASCII-CTRLS        = %x01-08       ; omit NULL (%x00), HTAB (%x09) and LF (%x0A)
+;                   / %x0B-0C        ; omit CR (%x0D)
+;                   / %x0E-1F        ; omit SP (%x20)
 ASCII-PUNCT        = %x21-23       ; omit DOLLAR
                   / %x25-27        ; omit () * +
                   / %x2C           ; omit . (%x2E) and - (%x2D)
@@ -131,6 +136,4 @@ NAMESTARTCHAR   =   	":" / ALPHA / "_" / %xC0-D6 / %xD8-F6 / %xF8-2FF / %x370-37
 NAMESTARTCHAR   =/  %x10000-10FFFF    ; SKIP-NODE-ABNF: TODO: <https://github.com/hildjj/node-abnf/issues/25>
 
 NAMECHAR	   =   	NAMESTARTCHAR / "-" / "." / DIGIT / %xB7 / %x0300-036F / %x203F-2040
-; NAME	   =   	NAMESTARTCHAR *(NAMECHAR)
 NMTOKEN	   =   	1*NAMECHAR
-; NMTOKENS	   =   	NMTOKEN *(SP NMTOKEN)
diff --git a/keyboards/abnf/transform-to-required.abnf b/keyboards/abnf/transform-to-required.abnf
index da5c7e47a34..25b16d74dbe 100644
--- a/keyboards/abnf/transform-to-required.abnf
+++ b/keyboards/abnf/transform-to-required.abnf
@@ -11,51 +11,60 @@
 ; Also note that a string may match this ABNF but be invalid according to the spec - which see.
 to-replacement          = atoms
 
+; a sequence of items for the output production
 atoms             = *(atom)
 
-atom = replacement-char / escaped-char / group-reference / codepointseq / named-marker / string-variable / mapped-set
+; each atom can be one of several things
+atom = replacement-char / escaped-char / group-reference / escaped-codepoints / named-marker / string-variable / mapped-set
 
-string-variable = "${" var-id "}"
+; normal text being output
+replacement-char         = content-char / ws / "-" / ":" / "(" / ")" / "." / "*" / "+" / "?" / "[" / "]" / "^" / "{" / "}" / "|"
 
-group-reference = "$" DIGIT
+; Character escapes
+escaped-char = backslash ( backslash / "$" ) / "$$"
 
-mapped-set = "$[1:" var-id "]"
+; reference to a capture group
+group-reference = "$" DIGIT
 
-codepointseq           = backslash "u" "{" cphexseq "}"
+; one or more space-separated hex codepoints, such as \u{01234} or \u{1234 5678}
+escaped-codepoints           = backslash "u" "{" codepoints-hex "}"
 
 ; multiple hex codepoints
-cphexseq = cphex *(SP cphex)
+codepoints-hex = codepoint-hex *(SP codepoint-hex)
 
 ; one hex codepoint (1-6 digits)
-cphex =  1*6LHEXDIG
+codepoint-hex =  1*6LHEXDIG
 
+; a specific marker ID.
 named-marker = "\m{" marker-id "}"
 
 ; marker id is nmtoken, but may be UAX31 in the future.
 marker-id = NMTOKEN
+
+
+; substitution of a string variable
+string-variable = "${" var-id "}"
+
 ; variable ID
 var-id = 1*32IDCHAR
 
-; fixed-class = backslash fixed-class-char
-
-; fixed-class-char = "s" / "S" / "t" / "r" / "n" / "f" / "v" / backslash / "$" / "d" / "w" / "D" / "W" / "0"
+; special case for a mapped set variable
+mapped-set = "$[1:" var-id "]"
 
-; normal text
-replacement-char         = content-char / ws / escaped-char / "-" / ":" / "(" / ")" / "." / "*" / "+" / "?" / "[" / "]" / "^" / "{" / "}" / "|"
 ; group for everything BUT syntax chars.
-content-char      = ASCII-CTRLS / ASCII-PUNCT / ALPHA / DIGIT / NON-ASCII
-
-; Character escapes
-escaped-char = backslash ( backslash / "$" ) / "$$"
+content-char      =  ASCII-PUNCT / ALPHA / DIGIT / NON-ASCII
 
+; the backslash character, which introduces escapes
 backslash    = %x5C ; U+005C REVERSE SOLIDUS "\"
+
+; whitespace
 ws = SP / HTAB / CR / LF / %x3000
 
 IDCHAR = ALPHA / DIGIT / "_"
 ; below is same as transform-from for maintenance
-ASCII-CTRLS        = %x01-08       ; omit NULL (%x00), HTAB (%x09) and LF (%x0A)
-                  / %x0B-0C        ; omit CR (%x0D)
-                  / %x0E-1F        ; omit SP (%x20)
+; ASCII-CTRLS        = %x01-08       ; omit NULL (%x00), HTAB (%x09) and LF (%x0A)
+;                   / %x0B-0C        ; omit CR (%x0D)
+;                   / %x0E-1F        ; omit SP (%x20)
 ASCII-PUNCT        = %x21-23       ; omit DOLLAR
                   / %x25-27        ; omit () * +
                   / %x2C           ; omit . (%x2E) and - (%x2D)
@@ -83,6 +92,4 @@ NAMESTARTCHAR   =   	":" / ALPHA / "_" / %xC0-D6 / %xD8-F6 / %xF8-2FF / %x370-37
 NAMESTARTCHAR   =/  %x10000-10FFFF    ; SKIP-NODE-ABNF: TODO: <https://github.com/hildjj/node-abnf/issues/25>
 
 NAMECHAR	   =   	NAMESTARTCHAR / "-" / "." / DIGIT / %xB7 / %x0300-036F / %x203F-2040
-; NAME	   =   	NAMESTARTCHAR *(NAMECHAR)
 NMTOKEN	   =   	1*NAMECHAR
-; NMTOKENS	   =   	NMTOKEN *(SP NMTOKEN)