Skip to content

Commit

Permalink
CLDR-16836 kbd: ebnf: update BNF per review
Browse files Browse the repository at this point in the history
- remove illegal ctrls
- reorder members
- cleanup
  • Loading branch information
srl295 committed Jan 8, 2025
1 parent 5dfac4f commit 98d36cb
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 74 deletions.
71 changes: 34 additions & 37 deletions docs/ldml/tr35-keyboards.md
Original file line number Diff line number Diff line change
Expand Up @@ -2438,23 +2438,24 @@ quark ::= non-group
| group
non-group
::= simple-matcher
| codepointseq
| escaped-codepoints
| variable
variable ::= string-variable
| set-variable
string-variable
::= '${' var-id '}'
set-variable
::= '$[' var-id ']'
var-id ::= IDCHAR+
group ::= capturing-group
| non-capturing-group
quantifier
::= bounded-quantifier
| '?'
codepointseq
::= '\' 'u' '{' cphexseq '}'
codepoint
::= '\' 'u' '{' cphexseq '}'
escaped-codepoints
::= '\' 'u' '{' codepoints-hex '}'
escaped-codepoint
::= '\' 'u' '{' codepoint-hex '}'
bounded-quantifier
::= '{' DIGIT ',' DIGIT '}'
non-capturing-group
Expand All @@ -2464,8 +2465,10 @@ capturing-group
catoms ::= catom+
catom ::= cquark quantifier?
cquark ::= non-group
cphexseq ::= cphex ( ' ' cphex )*
cphex ::= LHEXDIG ( LHEXDIG ( LHEXDIG ( LHEXDIG ( LHEXDIG LHEXDIG? )? )? )? )?
codepoints-hex
::= codepoint-hex ( ' ' codepoint-hex )*
codepoint-hex
::= LHEXDIG ( LHEXDIG ( LHEXDIG ( LHEXDIG ( LHEXDIG LHEXDIG? )? )? )? )?
simple-matcher
::= text-char
| class
Expand All @@ -2478,7 +2481,6 @@ match-named-marker
::= '\m{' marker-id '}'
marker-id
::= NMTOKEN
var-id ::= IDCHAR+
class ::= fixed-class
| set-class
fixed-class
Expand Down Expand Up @@ -2509,7 +2511,7 @@ set-member
char-range
::= range-edge '-' range-edge
range-edge
::= codepoint
::= escaped-codepoint
| range-char
set-negator
::= '^'?
Expand All @@ -2528,8 +2530,7 @@ range-char
| '{'
| '}'
content-char
::= ASCII-CTRLS
| ASCII-PUNCT
::= ASCII-PUNCT
| ALPHA
| DIGIT
| NON-ASCII
Expand All @@ -2542,8 +2543,6 @@ ws ::= [ #x3000]
IDCHAR ::= ALPHA
| DIGIT
| '_'
ASCII-CTRLS
::= [#x1-#x8#xB-#xC#xE-#x1F]
ASCII-PUNCT
::= [!-#%-',/;->_`#x7E-#x7F]
NON-ASCII
Expand Down Expand Up @@ -2594,29 +2593,13 @@ atoms ::= atom*
atom ::= replacement-char
| escaped-char
| group-reference
| codepointseq
| escaped-codepoints
| named-marker
| string-variable
| mapped-set
string-variable
::= '${' var-id '}'
group-reference
::= '$' DIGIT
mapped-set
::= '$[1:' var-id ']'
codepointseq
::= '\' 'u' '{' cphexseq '}'
cphexseq ::= cphex ( ' ' cphex )*
cphex ::= LHEXDIG ( LHEXDIG ( LHEXDIG ( LHEXDIG ( LHEXDIG LHEXDIG? )? )? )? )?
named-marker
::= '\m{' marker-id '}'
marker-id
::= NMTOKEN
var-id ::= IDCHAR+
replacement-char
::= content-char
| ws
| escaped-char
| '-'
| ':'
| '('
Expand All @@ -2631,24 +2614,38 @@ replacement-char
| '{'
| '}'
| '|'
escaped-char
::= '\' ( '\' | '$' )
| '$$'
group-reference
::= '$' DIGIT
escaped-codepoints
::= '\' 'u' '{' codepoints-hex '}'
codepoints-hex
::= codepoint-hex ( ' ' codepoint-hex )*
codepoint-hex
::= LHEXDIG ( LHEXDIG ( LHEXDIG ( LHEXDIG ( LHEXDIG LHEXDIG? )? )? )? )?
named-marker
::= '\m{' marker-id '}'
marker-id
::= NMTOKEN
string-variable
::= '${' var-id '}'
var-id ::= IDCHAR+
mapped-set
::= '$[1:' var-id ']'
content-char
::= ASCII-CTRLS
| ASCII-PUNCT
::= ASCII-PUNCT
| ALPHA
| DIGIT
| NON-ASCII
escaped-char
::= '\' ( '\' | '$' )
| '$$'
ws ::= [ #x3000]
| HTAB
| CR
| LF
IDCHAR ::= ALPHA
| DIGIT
| '_'
ASCII-CTRLS
::= [#x1-#x8#xB-#xC#xE-#x1F]
ASCII-PUNCT
::= [!-#%-',/;->_`#x7E-#x7F]
NON-ASCII
Expand Down
35 changes: 19 additions & 16 deletions keyboards/abnf/transform-from-required.abnf
Original file line number Diff line number Diff line change
Expand Up @@ -12,30 +12,38 @@

from-match = start-context atoms / atoms

; special marker anchoring to the start of context
start-context = "^"

; an empty match is not allowed.
; sequence of items for input match. note that empty is not allowed, must be at least one atom.
atoms = atom *(disjunction atom / atom)

; separator between alternatives (logical OR)
disjunction = "|"

; an atom is a 'quark' (the matching part), optionally followed by a quantifier
atom = quark quantifier / quark

; a quark is either a group or a non-group
quark = non-group / group

non-group = simple-matcher / codepointseq / variable
non-group = simple-matcher / escaped-codepoints / variable

variable = string-variable / set-variable

string-variable = "${" var-id "}"

set-variable = "$[" var-id "]"

; variable ID
var-id = 1*32IDCHAR

group = capturing-group / non-capturing-group

quantifier = bounded-quantifier / optional-quantifier

codepointseq = backslash "u" "{" cphexseq "}"
codepoint = backslash "u" "{" cphexseq "}"
escaped-codepoints = backslash "u" "{" codepoints-hex "}"
escaped-codepoint = backslash "u" "{" codepoint-hex "}"

bounded-quantifier = "{" DIGIT "," DIGIT "}"
optional-quantifier = "?"
Expand All @@ -54,10 +62,10 @@ catom = cquark / cquark quantifier
cquark = non-group

; multiple hex codepoints
cphexseq = cphex *(SP cphex)
codepoints-hex = codepoint-hex *(SP codepoint-hex)

; one hex codepoint (1-6 digits)
cphex = 1*6LHEXDIG
codepoint-hex = 1*6LHEXDIG

simple-matcher = text-char / class / match-any-codepoint / match-marker

Expand All @@ -66,11 +74,8 @@ match-any-codepoint = "."
match-marker = match-any-marker / match-named-marker
match-any-marker = "\m{.}"
match-named-marker = "\m{" marker-id "}"

; marker id is nmtoken, but may be UAX31 in the future.
marker-id = NMTOKEN
; variable ID
var-id = 1*32IDCHAR

class = fixed-class / set-class

Expand All @@ -82,7 +87,7 @@ set-class = "[" set-negator set-members "]"
set-members = set-member *(set-member)
set-member = text-char / char-range / match-marker
char-range = range-edge "-" range-edge
range-edge = codepoint / range-char
range-edge = escaped-codepoint / range-char
set-negator = "^" / ""

; Restrictions on characters in various contexts
Expand All @@ -92,7 +97,7 @@ text-char = content-char / ws / escaped-char / "-" / ":"
; text in a range sequence
range-char = content-char / ws / escaped-char / "."/ "|" / "{" / "}"
; group for everything BUT syntax chars.
content-char = ASCII-CTRLS / ASCII-PUNCT / ALPHA / DIGIT / NON-ASCII
content-char = ASCII-PUNCT / ALPHA / DIGIT / NON-ASCII

; Character escapes
escaped-char = backslash ( backslash / "{" / "|" / "}" )
Expand All @@ -101,9 +106,9 @@ backslash = %x5C ; U+005C REVERSE SOLIDUS "\"
ws = SP / HTAB / CR / LF / %x3000

IDCHAR = ALPHA / DIGIT / "_"
ASCII-CTRLS = %x01-08 ; omit NULL (%x00), HTAB (%x09) and LF (%x0A)
/ %x0B-0C ; omit CR (%x0D)
/ %x0E-1F ; omit SP (%x20)
; ASCII-CTRLS = %x01-08 ; omit NULL (%x00), HTAB (%x09) and LF (%x0A)
; / %x0B-0C ; omit CR (%x0D)
; / %x0E-1F ; omit SP (%x20)
ASCII-PUNCT = %x21-23 ; omit DOLLAR
/ %x25-27 ; omit () * +
/ %x2C ; omit . (%x2E) and - (%x2D)
Expand Down Expand Up @@ -131,6 +136,4 @@ NAMESTARTCHAR = ":" / ALPHA / "_" / %xC0-D6 / %xD8-F6 / %xF8-2FF / %x370-37
NAMESTARTCHAR =/ %x10000-10FFFF ; SKIP-NODE-ABNF: TODO: <https://github.com/hildjj/node-abnf/issues/25>

NAMECHAR = NAMESTARTCHAR / "-" / "." / DIGIT / %xB7 / %x0300-036F / %x203F-2040
; NAME = NAMESTARTCHAR *(NAMECHAR)
NMTOKEN = 1*NAMECHAR
; NMTOKENS = NMTOKEN *(SP NMTOKEN)
49 changes: 28 additions & 21 deletions keyboards/abnf/transform-to-required.abnf
Original file line number Diff line number Diff line change
Expand Up @@ -11,51 +11,60 @@
; Also note that a string may match this ABNF but be invalid according to the spec - which see.
to-replacement = atoms

; a sequence of items for the output production
atoms = *(atom)

atom = replacement-char / escaped-char / group-reference / codepointseq / named-marker / string-variable / mapped-set
; each atom can be one of several things
atom = replacement-char / escaped-char / group-reference / escaped-codepoints / named-marker / string-variable / mapped-set

string-variable = "${" var-id "}"
; normal text being output
replacement-char = content-char / ws / "-" / ":" / "(" / ")" / "." / "*" / "+" / "?" / "[" / "]" / "^" / "{" / "}" / "|"

group-reference = "$" DIGIT
; Character escapes
escaped-char = backslash ( backslash / "$" ) / "$$"

mapped-set = "$[1:" var-id "]"
; reference to a capture group
group-reference = "$" DIGIT

codepointseq = backslash "u" "{" cphexseq "}"
; one or more space-separated hex codepoints, such as \u{1234} or \u{1234 5678}
escaped-codepoints = backslash "u" "{" codepoints-hex "}"

; multiple hex codepoints
cphexseq = cphex *(SP cphex)
codepoints-hex = codepoint-hex *(SP codepoint-hex)

; one hex codepoint (1-6 digits)
cphex = 1*6LHEXDIG
codepoint-hex = 1*6LHEXDIG

; a specific marker ID.
named-marker = "\m{" marker-id "}"

; marker id is nmtoken, but may be UAX31 in the future.
marker-id = NMTOKEN


; substitution of a string variable
string-variable = "${" var-id "}"

; variable ID
var-id = 1*32IDCHAR

; fixed-class = backslash fixed-class-char

; fixed-class-char = "s" / "S" / "t" / "r" / "n" / "f" / "v" / backslash / "$" / "d" / "w" / "D" / "W" / "0"
; special case for a mapped set variable
mapped-set = "$[1:" var-id "]"

; normal text
replacement-char = content-char / ws / escaped-char / "-" / ":" / "(" / ")" / "." / "*" / "+" / "?" / "[" / "]" / "^" / "{" / "}" / "|"
; group for everything BUT syntax chars.
content-char = ASCII-CTRLS / ASCII-PUNCT / ALPHA / DIGIT / NON-ASCII

; Character escapes
escaped-char = backslash ( backslash / "$" ) / "$$"
content-char = ASCII-PUNCT / ALPHA / DIGIT / NON-ASCII

; \
backslash = %x5C ; U+005C REVERSE SOLIDUS "\"

; whitespace
ws = SP / HTAB / CR / LF / %x3000

IDCHAR = ALPHA / DIGIT / "_"
; below is same as transform-from for maintenance
ASCII-CTRLS = %x01-08 ; omit NULL (%x00), HTAB (%x09) and LF (%x0A)
/ %x0B-0C ; omit CR (%x0D)
/ %x0E-1F ; omit SP (%x20)
; ASCII-CTRLS = %x01-08 ; omit NULL (%x00), HTAB (%x09) and LF (%x0A)
; / %x0B-0C ; omit CR (%x0D)
; / %x0E-1F ; omit SP (%x20)
ASCII-PUNCT = %x21-23 ; omit DOLLAR
/ %x25-27 ; omit () * +
/ %x2C ; omit . (%x2E) and - (%x2D)
Expand Down Expand Up @@ -83,6 +92,4 @@ NAMESTARTCHAR = ":" / ALPHA / "_" / %xC0-D6 / %xD8-F6 / %xF8-2FF / %x370-37
NAMESTARTCHAR =/ %x10000-10FFFF ; SKIP-NODE-ABNF: TODO: <https://github.com/hildjj/node-abnf/issues/25>

NAMECHAR = NAMESTARTCHAR / "-" / "." / DIGIT / %xB7 / %x0300-036F / %x203F-2040
; NAME = NAMESTARTCHAR *(NAMECHAR)
NMTOKEN = 1*NAMECHAR
; NMTOKENS = NMTOKEN *(SP NMTOKEN)

0 comments on commit 98d36cb

Please sign in to comment.