From dbad162373a75a318a2336f1d3784d31443e4bbc Mon Sep 17 00:00:00 2001 From: Pierre-Marie de Rodat Date: Fri, 30 Jun 2023 13:56:36 +0000 Subject: [PATCH 1/2] misc/unicode: new testcase --- testsuite/tests/misc/unicode/empty.txt | 0 .../misc/unicode/expected_concrete_syntax.lkt | 16 +++ .../tests/misc/unicode/main-iso-8859-1.txt | 43 ++++++ testsuite/tests/misc/unicode/main.adb | 131 ++++++++++++++++++ testsuite/tests/misc/unicode/support.adb | 46 ++++++ testsuite/tests/misc/unicode/support.ads | 5 + testsuite/tests/misc/unicode/test.out | 76 ++++++++++ testsuite/tests/misc/unicode/test.py | 25 ++++ testsuite/tests/misc/unicode/test.yaml | 1 + 9 files changed, 343 insertions(+) create mode 100644 testsuite/tests/misc/unicode/empty.txt create mode 100644 testsuite/tests/misc/unicode/expected_concrete_syntax.lkt create mode 100644 testsuite/tests/misc/unicode/main-iso-8859-1.txt create mode 100644 testsuite/tests/misc/unicode/main.adb create mode 100644 testsuite/tests/misc/unicode/support.adb create mode 100644 testsuite/tests/misc/unicode/support.ads create mode 100644 testsuite/tests/misc/unicode/test.out create mode 100644 testsuite/tests/misc/unicode/test.py create mode 100644 testsuite/tests/misc/unicode/test.yaml diff --git a/testsuite/tests/misc/unicode/empty.txt b/testsuite/tests/misc/unicode/empty.txt new file mode 100644 index 000000000..e69de29bb diff --git a/testsuite/tests/misc/unicode/expected_concrete_syntax.lkt b/testsuite/tests/misc/unicode/expected_concrete_syntax.lkt new file mode 100644 index 000000000..e2c4e931d --- /dev/null +++ b/testsuite/tests/misc/unicode/expected_concrete_syntax.lkt @@ -0,0 +1,16 @@ +import lexer_example + +@with_lexer(foo_lexer) +grammar foo_grammar { + @main_rule main_rule <- list+(Example(@example StrLit(@string))) +} + +@abstract class FooNode implements Node[FooNode] { +} + +class Example: FooNode { + @parse_field f: StrLit +} + +class StrLit: FooNode implements TokenNode { +} diff --git 
a/testsuite/tests/misc/unicode/main-iso-8859-1.txt b/testsuite/tests/misc/unicode/main-iso-8859-1.txt new file mode 100644 index 000000000..10ad003f6 --- /dev/null +++ b/testsuite/tests/misc/unicode/main-iso-8859-1.txt @@ -0,0 +1,43 @@ +# ééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééé # +# ééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééé # + +example "1É" + +# ééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééé # + +example "1É2É" + +# ééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééé # + +example "1É2É3É" + +# ééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééé # + +example "1É2É3É4É" + +# ééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééé # + +example "1É2É3É4É5É" + +# ééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééé # + +example "1É2É3É4É5É6É" + +# ééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééé # + +example "1É2É3É4É5É6É7É" + +# ééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééé # + +example "1É2É3É4É5É6É7É8É" + +# ééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééé # + +example "1É2É3É4É5É6É7É8É9É" + +# ééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééé # + +example "1É2É3É4É5É6É7É8É9É0É" + +# ééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééé # +# ééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééééé # diff --git a/testsuite/tests/misc/unicode/main.adb b/testsuite/tests/misc/unicode/main.adb new file mode 100644 index 000000000..acf907df2 --- /dev/null +++ b/testsuite/tests/misc/unicode/main.adb @@ -0,0 +1,131 @@ +with Ada.Text_IO; use Ada.Text_IO; + +with GNAT.Strings; use GNAT.Strings; +with GNATCOLL.Mmap; use GNATCOLL.Mmap; + +with Langkit_Support.File_Readers; use Langkit_Support.File_Readers; +with Langkit_Support.Slocs; use 
Langkit_Support.Slocs; +with Libfoolang.Analysis; use Libfoolang.Analysis; +with Libfoolang.Common; use Libfoolang.Common; + +with Support; use Support; + +procedure Main is + + Empty_File : constant String := "empty.txt"; + Empty_Buffer : aliased constant String := ""; + + Example_File : constant String := "main-iso-8859-1.txt"; + Example_Buffer : String_Access := Read_Whole_File (Example_File); + + procedure Check + (From_Buffer : Boolean := False; + Empty_File : Boolean := False; + Wrong_Encoding : Boolean := False; + With_File_Reader : Boolean := False); + + ----------- + -- Check -- + ----------- + + procedure Check + (From_Buffer : Boolean := False; + Empty_File : Boolean := False; + Wrong_Encoding : Boolean := False; + With_File_Reader : Boolean := False) + is + Charset : constant String := + (if Wrong_Encoding then "utf-8" else "iso-8859-1"); + Filename : constant String := + (if Empty_File then Main.Empty_File else Example_File); + Buffer : constant access constant String := + (if Empty_File then Empty_Buffer'Access else Example_Buffer); + + Ctx : Analysis_Context; + U : Analysis_Unit; + begin + -- Put some label for this check + + Put ("== "); + Put (if From_Buffer then "buffer" else "file"); + Put (" | "); + Put (if Empty_File then "empty-file" else "example-file"); + Put (" | "); + Put (if Wrong_Encoding then "wrong-encoding" else "correct-encoding"); + Put (" | "); + Put (if With_File_Reader then "file-reader" else "default"); + Put_Line (" =="); + New_Line; + + -- Parse the source according to requested settings + + Ctx := Create_Context + (File_Reader => (if With_File_Reader + then Get_File_Reader + else No_File_Reader_Reference)); + if From_Buffer then + U := Ctx.Get_From_Buffer + (Filename => Filename, + Charset => Charset, + Buffer => Buffer.all); + else + U := Ctx.Get_From_File + (Filename => Filename, Charset => Charset); + end if; + + -- Display parsing errors, if any + + if U.Has_Diagnostics then + Put_Line ("Errors:"); + for D of 
U.Diagnostics loop + Put_Line (" " & U.Format_GNU_Diagnostic (D)); + end loop; + New_Line; + end if; + + -- Summarize the content of the parsed unit + + if U.Root.Is_Null then + Put_Line ("No root node"); + else + Put_Line ("Root node children:" & U.Root.Children_Count'Image); + declare + D : constant Token_Data_Type := Data (U.First_Token); + begin + Put_Line + ("First token: " + & Kind (D)'Image + & " at " & Image (Sloc_Range (D))); + end; + declare + D : constant Token_Data_Type := Data (U.Last_Token); + begin + Put_Line + ("Last token: " + & Kind (D)'Image + & " at " & Image (Sloc_Range (D))); + end; + end if; + New_Line; + end Check; + +begin + -- Get_From_File + + Check; + Check (With_File_Reader => True); + + Check (Empty_File => True); + Check (Empty_File => True, With_File_Reader => True); + + Check (Wrong_Encoding => True); + Check (Wrong_Encoding => True, With_File_Reader => True); + + -- Get_From_Buffer + + Check (From_Buffer => True); + Check (From_Buffer => True, Empty_File => True); + Check (From_Buffer => True, Wrong_Encoding => True); + + Free (Example_Buffer); +end Main; diff --git a/testsuite/tests/misc/unicode/support.adb b/testsuite/tests/misc/unicode/support.adb new file mode 100644 index 000000000..b0387cb3e --- /dev/null +++ b/testsuite/tests/misc/unicode/support.adb @@ -0,0 +1,46 @@ +with Langkit_Support.Diagnostics; use Langkit_Support.Diagnostics; + +package body Support is + + type My_FR is new File_Reader_Interface with null record; + + overriding procedure Read + (Self : My_FR; + Filename : String; + Charset : String; + Read_BOM : Boolean; + Contents : out Decoded_File_Contents; + Diagnostics : in out Diagnostics_Vectors.Vector); + + overriding procedure Release (Self : in out My_FR) is null; + + ---------- + -- Read -- + ---------- + + overriding procedure Read + (Self : My_FR; + Filename : String; + Charset : String; + Read_BOM : Boolean; + Contents : out Decoded_File_Contents; + Diagnostics : in out Diagnostics_Vectors.Vector) + is 
+ begin + Direct_Read (Filename, Charset, Read_BOM, Contents, Diagnostics); + if Diagnostics.Is_Empty and then Contents.Buffer.all'Length > 79 then + Contents.Buffer.all (Contents.First .. Contents.First + 79) := + (1 .. 80 => ' '); + end if; + end Read; + + --------------------- + -- Get_File_Reader -- + --------------------- + + function Get_File_Reader return File_Reader_Reference is + begin + return Create_File_Reader_Reference (My_FR'(null record)); + end Get_File_Reader; + +end Support; diff --git a/testsuite/tests/misc/unicode/support.ads b/testsuite/tests/misc/unicode/support.ads new file mode 100644 index 000000000..58088ac32 --- /dev/null +++ b/testsuite/tests/misc/unicode/support.ads @@ -0,0 +1,5 @@ +with Langkit_Support.File_Readers; use Langkit_Support.File_Readers; + +package Support is + function Get_File_Reader return File_Reader_Reference; +end Support; diff --git a/testsuite/tests/misc/unicode/test.out b/testsuite/tests/misc/unicode/test.out new file mode 100644 index 000000000..695ecd6de --- /dev/null +++ b/testsuite/tests/misc/unicode/test.out @@ -0,0 +1,76 @@ +== file | example-file | correct-encoding | default == + +Root node children: 10 +First token: FOO_COMMENT at 1:1-1:80 +Last token: FOO_TERMINATION at 44:1-44:1 + +== file | example-file | correct-encoding | file-reader == + +Root node children: 10 +First token: FOO_WHITESPACE at 1:1-1:81 +Last token: FOO_TERMINATION at 43:1-43:1 + +== file | empty-file | correct-encoding | default == + +Errors: + empty.txt:1:1: Expected 'example', got Termination + +Root node children: 0 +First token: FOO_TERMINATION at 1:1-1:1 +Last token: FOO_TERMINATION at 1:1-1:1 + +== file | empty-file | correct-encoding | file-reader == + +Errors: + empty.txt:1:1: Expected 'example', got Termination + +Root node children: 0 +First token: FOO_TERMINATION at 1:1-1:1 +Last token: FOO_TERMINATION at 1:1-1:1 + +== file | example-file | wrong-encoding | default == + +Errors: + main-iso-8859-1.txt:1:3: Could not decode 
source as "utf-8" + main-iso-8859-1.txt:1:1: Expected 'example', got Termination + +Root node children: 0 +First token: FOO_TERMINATION at 1:1-1:1 +Last token: FOO_TERMINATION at 1:1-1:1 + +== file | example-file | wrong-encoding | file-reader == + +Errors: + main-iso-8859-1.txt:1:3: Could not decode source as "utf-8" + main-iso-8859-1.txt:1:1: Expected 'example', got Termination + +Root node children: 0 +First token: FOO_TERMINATION at 1:1-1:1 +Last token: FOO_TERMINATION at 1:1-1:1 + +== buffer | example-file | correct-encoding | default == + +Root node children: 10 +First token: FOO_COMMENT at 1:1-1:80 +Last token: FOO_TERMINATION at 44:1-44:1 + +== buffer | empty-file | correct-encoding | default == + +Errors: + empty.txt:1:1: Expected 'example', got Termination + +Root node children: 0 +First token: FOO_TERMINATION at 1:1-1:1 +Last token: FOO_TERMINATION at 1:1-1:1 + +== buffer | example-file | wrong-encoding | default == + +Errors: + main-iso-8859-1.txt:1:3: Could not decode source as "utf-8" + main-iso-8859-1.txt:1:1: Expected 'example', got Termination + +Root node children: 0 +First token: FOO_TERMINATION at 1:1-1:1 +Last token: FOO_TERMINATION at 1:1-1:1 + +Done diff --git a/testsuite/tests/misc/unicode/test.py b/testsuite/tests/misc/unicode/test.py new file mode 100644 index 000000000..a3d24fe62 --- /dev/null +++ b/testsuite/tests/misc/unicode/test.py @@ -0,0 +1,25 @@ +""" +Check that the handling of Unicode for various parsing settings (get from +file/buffer, encoding, file reader, ...) works correctly. 
+""" + +from langkit.dsl import ASTNode, Field, T + +from utils import build_and_run + + +class FooNode(ASTNode): + pass + + +class Example(FooNode): + f = Field(type=T.StrLit) + + +class StrLit(FooNode): + token_node = True + + +build_and_run(lkt_file="expected_concrete_syntax.lkt", gpr_mains=["main.adb"]) + +print("Done") diff --git a/testsuite/tests/misc/unicode/test.yaml b/testsuite/tests/misc/unicode/test.yaml new file mode 100644 index 000000000..30423a038 --- /dev/null +++ b/testsuite/tests/misc/unicode/test.yaml @@ -0,0 +1 @@ +driver: python From 4048175e62bcb031bf2dfef7280e21ce092ac46a Mon Sep 17 00:00:00 2001 From: Pierre-Marie de Rodat Date: Wed, 28 Jun 2023 11:48:24 +0000 Subject: [PATCH 2/2] grammar/case_rule: extend the testcase to check non-ASCII tokens The logic of case/match lexing rules may be complex when working on source buffers encoded using variable-length charsets such as UTF-8. Extend this testcase so that the "backwards codepoint lookup" behavior is exercised with a multi-byte codepoint. --- .../tests/grammar/case_rule/expected_concrete_syntax.lkt | 2 +- testsuite/tests/grammar/case_rule/main.py | 1 + testsuite/tests/grammar/case_rule/test.out | 9 +++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/testsuite/tests/grammar/case_rule/expected_concrete_syntax.lkt b/testsuite/tests/grammar/case_rule/expected_concrete_syntax.lkt index 3661a65a9..13a512eb7 100644 --- a/testsuite/tests/grammar/case_rule/expected_concrete_syntax.lkt +++ b/testsuite/tests/grammar/case_rule/expected_concrete_syntax.lkt @@ -2,7 +2,7 @@ lexer foo_lexer { char dot <- "." 
- id <- p"[a-zA-Z]+" + id <- p"[a-zA-Zé🙂]+" tick <- "'" newline <- p"\n" diff --git a/testsuite/tests/grammar/case_rule/main.py b/testsuite/tests/grammar/case_rule/main.py index 91535a764..2e1c208d2 100644 --- a/testsuite/tests/grammar/case_rule/main.py +++ b/testsuite/tests/grammar/case_rule/main.py @@ -10,6 +10,7 @@ ('simple-attr', "a'b"), ('char-dot', "'a'.b"), ('id-char', "a'b'"), + ('unicode-id-char', "\xe9'\U0001f642'"), ): print('== {} =='.format(label)) u = ctx.get_from_buffer('{}.txt'.format(label), text) diff --git a/testsuite/tests/grammar/case_rule/test.out b/testsuite/tests/grammar/case_rule/test.out index b9287353b..76ce4c618 100644 --- a/testsuite/tests/grammar/case_rule/test.out +++ b/testsuite/tests/grammar/case_rule/test.out @@ -24,5 +24,14 @@ main.py: Running... +== unicode-id-char == +1:5-1:5: Expected Id, got Termination +-- + + + + + + main.py: Done. Done