From dbad162373a75a318a2336f1d3784d31443e4bbc Mon Sep 17 00:00:00 2001
From: Pierre-Marie de Rodat <derodat@adacore.com>
Date: Fri, 30 Jun 2023 13:56:36 +0000
Subject: [PATCH 1/2] misc/unicode: new testcase

---
 testsuite/tests/misc/unicode/empty.txt        |   0
 .../misc/unicode/expected_concrete_syntax.lkt |  16 +++
 .../tests/misc/unicode/main-iso-8859-1.txt    |  43 ++++++
 testsuite/tests/misc/unicode/main.adb         | 131 ++++++++++++++++++
 testsuite/tests/misc/unicode/support.adb      |  46 ++++++
 testsuite/tests/misc/unicode/support.ads      |   5 +
 testsuite/tests/misc/unicode/test.out         |  76 ++++++++++
 testsuite/tests/misc/unicode/test.py          |  25 ++++
 testsuite/tests/misc/unicode/test.yaml        |   1 +
 9 files changed, 343 insertions(+)
 create mode 100644 testsuite/tests/misc/unicode/empty.txt
 create mode 100644 testsuite/tests/misc/unicode/expected_concrete_syntax.lkt
 create mode 100644 testsuite/tests/misc/unicode/main-iso-8859-1.txt
 create mode 100644 testsuite/tests/misc/unicode/main.adb
 create mode 100644 testsuite/tests/misc/unicode/support.adb
 create mode 100644 testsuite/tests/misc/unicode/support.ads
 create mode 100644 testsuite/tests/misc/unicode/test.out
 create mode 100644 testsuite/tests/misc/unicode/test.py
 create mode 100644 testsuite/tests/misc/unicode/test.yaml

diff --git a/testsuite/tests/misc/unicode/empty.txt b/testsuite/tests/misc/unicode/empty.txt
new file mode 100644
index 000000000..e69de29bb
diff --git a/testsuite/tests/misc/unicode/expected_concrete_syntax.lkt b/testsuite/tests/misc/unicode/expected_concrete_syntax.lkt
new file mode 100644
index 000000000..e2c4e931d
--- /dev/null
+++ b/testsuite/tests/misc/unicode/expected_concrete_syntax.lkt
@@ -0,0 +1,16 @@
+import lexer_example
+
+@with_lexer(foo_lexer)
+grammar foo_grammar {
+    @main_rule main_rule <- list+(Example(@example StrLit(@string)))
+}
+
+@abstract class FooNode implements Node[FooNode] {
+}
+
+class Example: FooNode {
+    @parse_field f: StrLit
+}
+
+class StrLit: FooNode implements TokenNode {
+}
diff --git a/testsuite/tests/misc/unicode/main-iso-8859-1.txt b/testsuite/tests/misc/unicode/main-iso-8859-1.txt
new file mode 100644
index 000000000..10ad003f6
--- /dev/null
+++ b/testsuite/tests/misc/unicode/main-iso-8859-1.txt
@@ -0,0 +1,43 @@
+# ИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИ #
+# ИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИ #
+
+example "1и"
+
+# ИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИ #
+
+example "1и2и"
+
+# ИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИ #
+
+example "1и2и3и"
+
+# ИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИ #
+
+example "1и2и3и4и"
+
+# ИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИ #
+
+example "1и2и3и4и5и"
+
+# ИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИ #
+
+example "1и2и3и4и5и6и"
+
+# ИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИ #
+
+example "1и2и3и4и5и6и7и"
+
+# ИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИ #
+
+example "1и2и3и4и5и6и7и8и"
+
+# ИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИ #
+
+example "1и2и3и4и5и6и7и8и9и"
+
+# ИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИ #
+
+example "1и2и3и4и5и6и7и8и9и0и"
+
+# ИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИ #
+# ИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИИ #
diff --git a/testsuite/tests/misc/unicode/main.adb b/testsuite/tests/misc/unicode/main.adb
new file mode 100644
index 000000000..acf907df2
--- /dev/null
+++ b/testsuite/tests/misc/unicode/main.adb
@@ -0,0 +1,131 @@
+with Ada.Text_IO; use Ada.Text_IO;
+
+with GNAT.Strings;  use GNAT.Strings;
+with GNATCOLL.Mmap; use GNATCOLL.Mmap;
+
+with Langkit_Support.File_Readers; use Langkit_Support.File_Readers;
+with Langkit_Support.Slocs;        use Langkit_Support.Slocs;
+with Libfoolang.Analysis;          use Libfoolang.Analysis;
+with Libfoolang.Common;            use Libfoolang.Common;
+
+with Support; use Support;
+
+procedure Main is
+
+   Empty_File     : constant String := "empty.txt";
+   Empty_Buffer   : aliased constant String := "";
+
+   Example_File   : constant String := "main-iso-8859-1.txt";
+   Example_Buffer : String_Access := Read_Whole_File (Example_File);
+
+   procedure Check
+     (From_Buffer      : Boolean := False;
+      Empty_File       : Boolean := False;
+      Wrong_Encoding   : Boolean := False;
+      With_File_Reader : Boolean := False);
+
+   -----------
+   -- Check --
+   -----------
+
+   procedure Check
+     (From_Buffer      : Boolean := False;
+      Empty_File       : Boolean := False;
+      Wrong_Encoding   : Boolean := False;
+      With_File_Reader : Boolean := False)
+   is
+      Charset  : constant String :=
+        (if Wrong_Encoding then "utf-8" else "iso-8859-1");
+      Filename : constant String :=
+        (if Empty_File then Main.Empty_File else Example_File);
+      Buffer   : constant access constant String :=
+        (if Empty_File then Empty_Buffer'Access else Example_Buffer);
+
+      Ctx : Analysis_Context;
+      U   : Analysis_Unit;
+   begin
+      --  Put some label for this check
+
+      Put ("== ");
+      Put (if From_Buffer then "buffer" else "file");
+      Put (" | ");
+      Put (if Empty_File then "empty-file" else "example-file");
+      Put (" | ");
+      Put (if Wrong_Encoding then "wrong-encoding" else "correct-encoding");
+      Put (" | ");
+      Put (if With_File_Reader then "file-reader" else "default");
+      Put_Line (" ==");
+      New_Line;
+
+      --  Parse the source according to requested settings
+
+      Ctx := Create_Context
+        (File_Reader => (if With_File_Reader
+                         then Get_File_Reader
+                         else No_File_Reader_Reference));
+      if From_Buffer then
+         U := Ctx.Get_From_Buffer
+           (Filename => Filename,
+            Charset  => Charset,
+            Buffer   => Buffer.all);
+      else
+         U := Ctx.Get_From_File
+           (Filename => Filename, Charset => Charset);
+      end if;
+
+      --  Display parsing errors, if any
+
+      if U.Has_Diagnostics then
+         Put_Line ("Errors:");
+         for D of U.Diagnostics loop
+            Put_Line ("  " & U.Format_GNU_Diagnostic (D));
+         end loop;
+         New_Line;
+      end if;
+
+      --  Summarize the content of the parsed unit
+
+      if U.Root.Is_Null then
+         Put_Line ("No root node");
+      else
+         Put_Line ("Root node children:" & U.Root.Children_Count'Image);
+         declare
+            D : constant Token_Data_Type := Data (U.First_Token);
+         begin
+            Put_Line
+              ("First token: "
+               & Kind (D)'Image
+               & " at " & Image (Sloc_Range (D)));
+         end;
+         declare
+            D : constant Token_Data_Type := Data (U.Last_Token);
+         begin
+            Put_Line
+              ("Last token:  "
+               & Kind (D)'Image
+               & " at " & Image (Sloc_Range (D)));
+         end;
+      end if;
+      New_Line;
+   end Check;
+
+begin
+   --  Get_From_File
+
+   Check;
+   Check (With_File_Reader => True);
+
+   Check (Empty_File => True);
+   Check (Empty_File => True, With_File_Reader => True);
+
+   Check (Wrong_Encoding => True);
+   Check (Wrong_Encoding => True, With_File_Reader => True);
+
+   --  Get_From_Buffer
+
+   Check (From_Buffer => True);
+   Check (From_Buffer => True, Empty_File => True);
+   Check (From_Buffer => True, Wrong_Encoding => True);
+
+   Free (Example_Buffer);
+end Main;
diff --git a/testsuite/tests/misc/unicode/support.adb b/testsuite/tests/misc/unicode/support.adb
new file mode 100644
index 000000000..b0387cb3e
--- /dev/null
+++ b/testsuite/tests/misc/unicode/support.adb
@@ -0,0 +1,46 @@
+with Langkit_Support.Diagnostics; use Langkit_Support.Diagnostics;
+
+package body Support is
+
+   type My_FR is new File_Reader_Interface with null record;
+
+   overriding procedure Read
+     (Self        : My_FR;
+      Filename    : String;
+      Charset     : String;
+      Read_BOM    : Boolean;
+      Contents    : out Decoded_File_Contents;
+      Diagnostics : in out Diagnostics_Vectors.Vector);
+
+   overriding procedure Release (Self : in out My_FR) is null;
+
+   ----------
+   -- Read --
+   ----------
+
+   overriding procedure Read
+     (Self        : My_FR;
+      Filename    : String;
+      Charset     : String;
+      Read_BOM    : Boolean;
+      Contents    : out Decoded_File_Contents;
+      Diagnostics : in out Diagnostics_Vectors.Vector)
+   is
+   begin
+      Direct_Read (Filename, Charset, Read_BOM, Contents, Diagnostics);
+      if Diagnostics.Is_Empty and then Contents.Buffer.all'Length > 79 then
+         Contents.Buffer.all (Contents.First .. Contents.First + 79) :=
+           (1 .. 80 => ' ');
+      end if;
+   end Read;
+
+   ---------------------
+   -- Get_File_Reader --
+   ---------------------
+
+   function Get_File_Reader return File_Reader_Reference is
+   begin
+      return Create_File_Reader_Reference (My_FR'(null record));
+   end Get_File_Reader;
+
+end Support;
diff --git a/testsuite/tests/misc/unicode/support.ads b/testsuite/tests/misc/unicode/support.ads
new file mode 100644
index 000000000..58088ac32
--- /dev/null
+++ b/testsuite/tests/misc/unicode/support.ads
@@ -0,0 +1,5 @@
+with Langkit_Support.File_Readers; use Langkit_Support.File_Readers;
+
+package Support is
+   function Get_File_Reader return File_Reader_Reference;
+end Support;
diff --git a/testsuite/tests/misc/unicode/test.out b/testsuite/tests/misc/unicode/test.out
new file mode 100644
index 000000000..695ecd6de
--- /dev/null
+++ b/testsuite/tests/misc/unicode/test.out
@@ -0,0 +1,76 @@
+== file | example-file | correct-encoding | default ==
+
+Root node children: 10
+First token: FOO_COMMENT at 1:1-1:80
+Last token:  FOO_TERMINATION at 44:1-44:1
+
+== file | example-file | correct-encoding | file-reader ==
+
+Root node children: 10
+First token: FOO_WHITESPACE at 1:1-1:81
+Last token:  FOO_TERMINATION at 43:1-43:1
+
+== file | empty-file | correct-encoding | default ==
+
+Errors:
+  empty.txt:1:1: Expected 'example', got Termination
+
+Root node children: 0
+First token: FOO_TERMINATION at 1:1-1:1
+Last token:  FOO_TERMINATION at 1:1-1:1
+
+== file | empty-file | correct-encoding | file-reader ==
+
+Errors:
+  empty.txt:1:1: Expected 'example', got Termination
+
+Root node children: 0
+First token: FOO_TERMINATION at 1:1-1:1
+Last token:  FOO_TERMINATION at 1:1-1:1
+
+== file | example-file | wrong-encoding | default ==
+
+Errors:
+  main-iso-8859-1.txt:1:3: Could not decode source as "utf-8"
+  main-iso-8859-1.txt:1:1: Expected 'example', got Termination
+
+Root node children: 0
+First token: FOO_TERMINATION at 1:1-1:1
+Last token:  FOO_TERMINATION at 1:1-1:1
+
+== file | example-file | wrong-encoding | file-reader ==
+
+Errors:
+  main-iso-8859-1.txt:1:3: Could not decode source as "utf-8"
+  main-iso-8859-1.txt:1:1: Expected 'example', got Termination
+
+Root node children: 0
+First token: FOO_TERMINATION at 1:1-1:1
+Last token:  FOO_TERMINATION at 1:1-1:1
+
+== buffer | example-file | correct-encoding | default ==
+
+Root node children: 10
+First token: FOO_COMMENT at 1:1-1:80
+Last token:  FOO_TERMINATION at 44:1-44:1
+
+== buffer | empty-file | correct-encoding | default ==
+
+Errors:
+  empty.txt:1:1: Expected 'example', got Termination
+
+Root node children: 0
+First token: FOO_TERMINATION at 1:1-1:1
+Last token:  FOO_TERMINATION at 1:1-1:1
+
+== buffer | example-file | wrong-encoding | default ==
+
+Errors:
+  main-iso-8859-1.txt:1:3: Could not decode source as "utf-8"
+  main-iso-8859-1.txt:1:1: Expected 'example', got Termination
+
+Root node children: 0
+First token: FOO_TERMINATION at 1:1-1:1
+Last token:  FOO_TERMINATION at 1:1-1:1
+
+Done
diff --git a/testsuite/tests/misc/unicode/test.py b/testsuite/tests/misc/unicode/test.py
new file mode 100644
index 000000000..a3d24fe62
--- /dev/null
+++ b/testsuite/tests/misc/unicode/test.py
@@ -0,0 +1,25 @@
+"""
+Check that the handling of Unicode for various parsing settings (get from
+file/buffer, encoding, file reader, ...) works correctly.
+"""
+
+from langkit.dsl import ASTNode, Field, T
+
+from utils import build_and_run
+
+
+class FooNode(ASTNode):
+    pass
+
+
+class Example(FooNode):
+    f = Field(type=T.StrLit)
+
+
+class StrLit(FooNode):
+    token_node = True
+
+
+build_and_run(lkt_file="expected_concrete_syntax.lkt", gpr_mains=["main.adb"])
+
+print("Done")
diff --git a/testsuite/tests/misc/unicode/test.yaml b/testsuite/tests/misc/unicode/test.yaml
new file mode 100644
index 000000000..30423a038
--- /dev/null
+++ b/testsuite/tests/misc/unicode/test.yaml
@@ -0,0 +1 @@
+driver: python

From 4048175e62bcb031bf2dfef7280e21ce092ac46a Mon Sep 17 00:00:00 2001
From: Pierre-Marie de Rodat <derodat@adacore.com>
Date: Wed, 28 Jun 2023 11:48:24 +0000
Subject: [PATCH 2/2] grammar/case_rule: extend the testcase to check non-ASCII
 tokens

The logic of case/match lexing rules can be complex when working on
source buffers encoded with variable-length charsets such as UTF-8.
Extend this testcase so that the "backwards codepoint lookup" behavior
is exercised with a multi-byte codepoint.
---
 .../tests/grammar/case_rule/expected_concrete_syntax.lkt | 2 +-
 testsuite/tests/grammar/case_rule/main.py                | 1 +
 testsuite/tests/grammar/case_rule/test.out               | 9 +++++++++
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/testsuite/tests/grammar/case_rule/expected_concrete_syntax.lkt b/testsuite/tests/grammar/case_rule/expected_concrete_syntax.lkt
index 3661a65a9..13a512eb7 100644
--- a/testsuite/tests/grammar/case_rule/expected_concrete_syntax.lkt
+++ b/testsuite/tests/grammar/case_rule/expected_concrete_syntax.lkt
@@ -2,7 +2,7 @@ lexer foo_lexer {
 
     char
     dot <- "."
-    id <- p"[a-zA-Z]+"
+    id <- p"[a-zA-Zц╘П÷≥┌]+"
     tick <- "'"
     newline <- p"\n"
 
diff --git a/testsuite/tests/grammar/case_rule/main.py b/testsuite/tests/grammar/case_rule/main.py
index 91535a764..2e1c208d2 100644
--- a/testsuite/tests/grammar/case_rule/main.py
+++ b/testsuite/tests/grammar/case_rule/main.py
@@ -10,6 +10,7 @@
     ('simple-attr', "a'b"),
     ('char-dot', "'a'.b"),
     ('id-char', "a'b'"),
+    ('unicode-id-char', "\xe9'\U0001f642'"),
 ):
     print('== {} =='.format(label))
     u = ctx.get_from_buffer('{}.txt'.format(label), text)
diff --git a/testsuite/tests/grammar/case_rule/test.out b/testsuite/tests/grammar/case_rule/test.out
index b9287353b..76ce4c618 100644
--- a/testsuite/tests/grammar/case_rule/test.out
+++ b/testsuite/tests/grammar/case_rule/test.out
@@ -24,5 +24,14 @@ main.py: Running...
 <Token Tick "'" at 1:4-1:5>
 <Token Termination at 1:5-1:5>
 
+== unicode-id-char ==
+1:5-1:5: Expected Id, got Termination
+--
+<Token Id 'ц╘' at 1:1-1:2>
+<Token Tick "'" at 1:2-1:3>
+<Token Id 'П÷≥┌' at 1:3-1:4>
+<Token Tick "'" at 1:4-1:5>
+<Token Termination at 1:5-1:5>
+
 main.py: Done.
 Done