From 4048175e62bcb031bf2dfef7280e21ce092ac46a Mon Sep 17 00:00:00 2001 From: Pierre-Marie de Rodat Date: Wed, 28 Jun 2023 11:48:24 +0000 Subject: [PATCH] grammar/case_rule: extend the testcase to check non-ASCII tokens The logic of case/match lexing rules may be complex when working on source buffers encoded using varying length charsets such as UTF-8. Extend this testcase so that the "backwards codepoint lookup" behavior is exercised with a multi-byte codepoint. --- .../tests/grammar/case_rule/expected_concrete_syntax.lkt | 2 +- testsuite/tests/grammar/case_rule/main.py | 1 + testsuite/tests/grammar/case_rule/test.out | 9 +++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/testsuite/tests/grammar/case_rule/expected_concrete_syntax.lkt b/testsuite/tests/grammar/case_rule/expected_concrete_syntax.lkt index 3661a65a9..13a512eb7 100644 --- a/testsuite/tests/grammar/case_rule/expected_concrete_syntax.lkt +++ b/testsuite/tests/grammar/case_rule/expected_concrete_syntax.lkt @@ -2,7 +2,7 @@ lexer foo_lexer { char dot <- "." - id <- p"[a-zA-Z]+" + id <- p"[a-zA-Zé🙂]+" tick <- "'" newline <- p"\n" diff --git a/testsuite/tests/grammar/case_rule/main.py b/testsuite/tests/grammar/case_rule/main.py index 91535a764..2e1c208d2 100644 --- a/testsuite/tests/grammar/case_rule/main.py +++ b/testsuite/tests/grammar/case_rule/main.py @@ -10,6 +10,7 @@ ('simple-attr', "a'b"), ('char-dot', "'a'.b"), ('id-char', "a'b'"), + ('unicode-id-char', "\xe9'\U0001f642'"), ): print('== {} =='.format(label)) u = ctx.get_from_buffer('{}.txt'.format(label), text) diff --git a/testsuite/tests/grammar/case_rule/test.out b/testsuite/tests/grammar/case_rule/test.out index b9287353b..76ce4c618 100644 --- a/testsuite/tests/grammar/case_rule/test.out +++ b/testsuite/tests/grammar/case_rule/test.out @@ -24,5 +24,14 @@ main.py: Running... +== unicode-id-char == +1:5-1:5: Expected Id, got Termination +-- + + + + + + main.py: Done. Done