grammar/case_rule: extend the testcase to check non-ASCII tokens
The logic of case/match lexing rules may be complex when working on
source buffers encoded with variable-length encodings such as UTF-8.
Extend this testcase so that the "backwards codepoint lookup" behavior
is exercised with a multi-byte codepoint.
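To illustrate the difficulty the commit message refers to: in a UTF-8 buffer, stepping back one byte does not necessarily land on a codepoint boundary, so a backwards lookup must skip continuation bytes first. A minimal sketch (not Langkit's actual implementation; `prev_codepoint_start` is a hypothetical helper):

```python
def prev_codepoint_start(buf: bytes, index: int) -> int:
    """Return the byte offset where the codepoint before ``index`` starts.

    UTF-8 continuation bytes match the pattern 0b10xxxxxx, so we walk
    backwards until we hit a lead byte (or the start of the buffer).
    """
    i = index - 1
    while i > 0 and (buf[i] & 0xC0) == 0x80:  # continuation byte?
        i -= 1
    return i

buf = "é'🙂'".encode("utf-8")  # é is 2 bytes, 🙂 is 4 bytes in UTF-8
# The "'" before 🙂 sits at byte 2; one codepoint back is é at byte 0.
assert prev_codepoint_start(buf, 2) == 0
# 🙂 spans bytes 3..6; stepping back from byte 7 skips 3 continuation bytes.
assert prev_codepoint_start(buf, 7) == 3
```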
pmderodat committed Mar 13, 2024
1 parent dbad162 commit 4048175
Showing 3 changed files with 11 additions and 1 deletion.
@@ -2,7 +2,7 @@ lexer foo_lexer {

char
dot <- "."
id <- p"[a-zA-Z]+"
id <- p"[a-zA-Zé🙂]+"
tick <- "'"
newline <- p"\n"

1 change: 1 addition & 0 deletions testsuite/tests/grammar/case_rule/main.py
@@ -10,6 +10,7 @@
('simple-attr', "a'b"),
('char-dot', "'a'.b"),
('id-char', "a'b'"),
('unicode-id-char', "\xe9'\U0001f642'"),
):
print('== {} =='.format(label))
u = ctx.get_from_buffer('{}.txt'.format(label), text)
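For reference, the escape sequences in the new test entry decode to exactly the non-ASCII characters added to the lexer's `id` rule, and their UTF-8 encodings have different byte lengths even though each is a single codepoint:

```python
text = "\xe9'\U0001f642'"
assert text == "é'🙂'"
# 4 codepoints total, matching the column offsets 1:1-1:5 in test.out...
assert len(text) == 4
# ...but 8 bytes in UTF-8 (é takes 2 bytes, 🙂 takes 4).
assert text.encode("utf-8") == b"\xc3\xa9'\xf0\x9f\x99\x82'"
```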
9 changes: 9 additions & 0 deletions testsuite/tests/grammar/case_rule/test.out
@@ -24,5 +24,14 @@ main.py: Running...
<Token Tick "'" at 1:4-1:5>
<Token Termination at 1:5-1:5>

== unicode-id-char ==
1:5-1:5: Expected Id, got Termination
--
<Token Id 'é' at 1:1-1:2>
<Token Tick "'" at 1:2-1:3>
<Token Id '🙂' at 1:3-1:4>
<Token Tick "'" at 1:4-1:5>
<Token Termination at 1:5-1:5>

main.py: Done.
Done
