Skip to content

Commit

Permalink
Merge branch 'mr/pmderodat/utf8-preparatory' into 'master'
Browse files Browse the repository at this point in the history
Preparatory work for the transition of source buffers to UTF-8

See merge request eng/libadalang/langkit!1022
  • Loading branch information
pmderodat committed Mar 13, 2024
2 parents 6f53bc0 + 4048175 commit 2957667
Show file tree
Hide file tree
Showing 12 changed files with 354 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ lexer foo_lexer {

char
dot <- "."
id <- p"[a-zA-Z]+"
id <- p"[a-zA-Zé🙂]+"
tick <- "'"
newline <- p"\n"

Expand Down
1 change: 1 addition & 0 deletions testsuite/tests/grammar/case_rule/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
('simple-attr', "a'b"),
('char-dot', "'a'.b"),
('id-char', "a'b'"),
('unicode-id-char', "\xe9'\U0001f642'"),
):
print('== {} =='.format(label))
u = ctx.get_from_buffer('{}.txt'.format(label), text)
Expand Down
9 changes: 9 additions & 0 deletions testsuite/tests/grammar/case_rule/test.out
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,14 @@ main.py: Running...
<Token Tick "'" at 1:4-1:5>
<Token Termination at 1:5-1:5>

== unicode-id-char ==
1:5-1:5: Expected Id, got Termination
--
<Token Id 'é' at 1:1-1:2>
<Token Tick "'" at 1:2-1:3>
<Token Id '🙂' at 1:3-1:4>
<Token Tick "'" at 1:4-1:5>
<Token Termination at 1:5-1:5>

main.py: Done.
Done
Empty file.
16 changes: 16 additions & 0 deletions testsuite/tests/misc/unicode/expected_concrete_syntax.lkt
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import lexer_example

@with_lexer(foo_lexer)
grammar foo_grammar {
@main_rule main_rule <- list+(Example(@example StrLit(@string)))
}

@abstract class FooNode implements Node[FooNode] {
}

class Example: FooNode {
@parse_field f: StrLit
}

class StrLit: FooNode implements TokenNode {
}
43 changes: 43 additions & 0 deletions testsuite/tests/misc/unicode/main-iso-8859-1.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# ��������������������������������������������������������������������������� #
# ��������������������������������������������������������������������������� #

example "1�"

# ��������������������������������������������������������������������������� #

example "1�2�"

# ��������������������������������������������������������������������������� #

example "1�2�3�"

# ��������������������������������������������������������������������������� #

example "1�2�3�4�"

# ��������������������������������������������������������������������������� #

example "1�2�3�4�5�"

# ��������������������������������������������������������������������������� #

example "1�2�3�4�5�6�"

# ��������������������������������������������������������������������������� #

example "1�2�3�4�5�6�7�"

# ��������������������������������������������������������������������������� #

example "1�2�3�4�5�6�7�8�"

# ��������������������������������������������������������������������������� #

example "1�2�3�4�5�6�7�8�9�"

# ��������������������������������������������������������������������������� #

example "1�2�3�4�5�6�7�8�9�0�"

# ��������������������������������������������������������������������������� #
# ��������������������������������������������������������������������������� #
131 changes: 131 additions & 0 deletions testsuite/tests/misc/unicode/main.adb
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
with Ada.Text_IO; use Ada.Text_IO;

with GNAT.Strings; use GNAT.Strings;
with GNATCOLL.Mmap; use GNATCOLL.Mmap;

with Langkit_Support.File_Readers; use Langkit_Support.File_Readers;
with Langkit_Support.Slocs; use Langkit_Support.Slocs;
with Libfoolang.Analysis; use Libfoolang.Analysis;
with Libfoolang.Common; use Libfoolang.Common;

with Support; use Support;

procedure Main is

   --  Name and in-memory contents used for the "empty-file" checks

   Empty_File   : constant String := "empty.txt";
   Empty_Buffer : constant String := "";

   --  Name of the example source and its raw bytes, loaded once up front
   --  and released at the end of Main.

   Example_File   : constant String := "main-iso-8859-1.txt";
   Example_Buffer : String_Access := Read_Whole_File (Example_File);

   procedure Check
     (From_Buffer      : Boolean := False;
      Empty_File       : Boolean := False;
      Wrong_Encoding   : Boolean := False;
      With_File_Reader : Boolean := False);
   --  Parse one source and print a report on standard output.
   --
   --  From_Buffer selects Get_From_Buffer instead of Get_From_File.
   --  Empty_File selects the empty source instead of the example one.
   --  Wrong_Encoding requests "utf-8" instead of "iso-8859-1" as the charset.
   --  With_File_Reader creates the context with Support.Get_File_Reader
   --  instead of No_File_Reader_Reference.

   -----------
   -- Check --
   -----------

   procedure Check
     (From_Buffer      : Boolean := False;
      Empty_File       : Boolean := False;
      Wrong_Encoding   : Boolean := False;
      With_File_Reader : Boolean := False)
   is
      Source_Name : constant String :=
        (if Empty_File then Main.Empty_File else Example_File);
      Encoding    : constant String :=
        (if Wrong_Encoding then "utf-8" else "iso-8859-1");

      Context : Analysis_Context;
      Unit    : Analysis_Unit;

      procedure Put_Token_Summary
        (Prefix : String; Token : Token_Data_Type);
      --  Print Prefix followed by the kind and source location range of
      --  Token.

      -----------------------
      -- Put_Token_Summary --
      -----------------------

      procedure Put_Token_Summary
        (Prefix : String; Token : Token_Data_Type) is
      begin
         Put_Line
           (Prefix & Kind (Token)'Image
            & " at " & Image (Sloc_Range (Token)));
      end Put_Token_Summary;

   begin
      --  Header line that identifies which combination is being checked

      Put_Line
        ("== "
         & (if From_Buffer then "buffer" else "file")
         & " | "
         & (if Empty_File then "empty-file" else "example-file")
         & " | "
         & (if Wrong_Encoding then "wrong-encoding" else "correct-encoding")
         & " | "
         & (if With_File_Reader then "file-reader" else "default")
         & " ==");
      New_Line;

      --  Create the context, with or without the custom file reader

      if With_File_Reader then
         Context := Create_Context (File_Reader => Get_File_Reader);
      else
         Context := Create_Context (File_Reader => No_File_Reader_Reference);
      end if;

      --  Load the unit through the requested entry point

      if not From_Buffer then
         Unit := Context.Get_From_File
           (Filename => Source_Name, Charset => Encoding);
      else
         Unit := Context.Get_From_Buffer
           (Filename => Source_Name,
            Charset  => Encoding,
            Buffer   =>
              (if Empty_File then Empty_Buffer else Example_Buffer.all));
      end if;

      --  Report diagnostics attached to the unit, if there are any

      if Unit.Has_Diagnostics then
         Put_Line ("Errors:");
         for Diag of Unit.Diagnostics loop
            Put_Line (" " & Unit.Format_GNU_Diagnostic (Diag));
         end loop;
         New_Line;
      end if;

      --  Report the shape of the result: number of children of the root
      --  node, then the first and last tokens of the unit.

      if not Unit.Root.Is_Null then
         Put_Line ("Root node children:" & Unit.Root.Children_Count'Image);
         Put_Token_Summary ("First token: ", Data (Unit.First_Token));
         Put_Token_Summary ("Last token: ", Data (Unit.Last_Token));
      else
         Put_Line ("No root node");
      end if;
      New_Line;
   end Check;

begin
   --  Get_From_File: run each configuration first with the default reader,
   --  then with the custom file reader.

   for Use_Reader in Boolean loop
      Check (With_File_Reader => Use_Reader);
   end loop;

   for Use_Reader in Boolean loop
      Check (Empty_File => True, With_File_Reader => Use_Reader);
   end loop;

   for Use_Reader in Boolean loop
      Check (Wrong_Encoding => True, With_File_Reader => Use_Reader);
   end loop;

   --  Get_From_Buffer

   Check (From_Buffer => True);
   Check (From_Buffer => True, Empty_File => True);
   Check (From_Buffer => True, Wrong_Encoding => True);

   Free (Example_Buffer);
end Main;
46 changes: 46 additions & 0 deletions testsuite/tests/misc/unicode/support.adb
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
with Langkit_Support.Diagnostics; use Langkit_Support.Diagnostics;

package body Support is

   Blank_Count : constant := 80;
   --  Number of characters that My_FR.Read overwrites with spaces

   type My_FR is new File_Reader_Interface with null record;
   --  File reader that behaves like Direct_Read, except that on success it
   --  blanks Blank_Count characters starting at the first decoded one.
   --  NOTE(review): presumably this makes the use of the custom reader
   --  observable in the parsing results; confirm against the test baseline.

   overriding procedure Read
     (Self        : My_FR;
      Filename    : String;
      Charset     : String;
      Read_BOM    : Boolean;
      Contents    : out Decoded_File_Contents;
      Diagnostics : in out Diagnostics_Vectors.Vector);

   overriding procedure Release (Self : in out My_FR) is null;

   ----------
   -- Read --
   ----------

   overriding procedure Read
     (Self        : My_FR;
      Filename    : String;
      Charset     : String;
      Read_BOM    : Boolean;
      Contents    : out Decoded_File_Contents;
      Diagnostics : in out Diagnostics_Vectors.Vector)
   is
   begin
      Direct_Read (Filename, Charset, Read_BOM, Contents, Diagnostics);

      --  Leave the result untouched when reading produced diagnostics or
      --  when the buffer is shorter than the span to blank. The "or else"
      --  keeps the buffer from being dereferenced when there are
      --  diagnostics.

      if not Diagnostics.Is_Empty
         or else Contents.Buffer.all'Length < Blank_Count
      then
         return;
      end if;

      Contents.Buffer.all
        (Contents.First .. Contents.First + Blank_Count - 1) :=
        (1 .. Blank_Count => ' ');
   end Read;

   ---------------------
   -- Get_File_Reader --
   ---------------------

   function Get_File_Reader return File_Reader_Reference is
      Reader : constant My_FR := (null record);
   begin
      return Create_File_Reader_Reference (Reader);
   end Get_File_Reader;

end Support;
5 changes: 5 additions & 0 deletions testsuite/tests/misc/unicode/support.ads
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
with Langkit_Support.File_Readers; use Langkit_Support.File_Readers;

--  Test helper that exposes a custom file reader to the test driver

package Support is
function Get_File_Reader return File_Reader_Reference;
--  Return a reference to a new file reader. Its Read operation calls
--  Direct_Read and then, when no diagnostic was emitted and the decoded
--  buffer holds more than 79 characters, overwrites 80 characters starting
--  at the first one with spaces.
end Support;
76 changes: 76 additions & 0 deletions testsuite/tests/misc/unicode/test.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
== file | example-file | correct-encoding | default ==

Root node children: 10
First token: FOO_COMMENT at 1:1-1:80
Last token: FOO_TERMINATION at 44:1-44:1

== file | example-file | correct-encoding | file-reader ==

Root node children: 10
First token: FOO_WHITESPACE at 1:1-1:81
Last token: FOO_TERMINATION at 43:1-43:1

== file | empty-file | correct-encoding | default ==

Errors:
empty.txt:1:1: Expected 'example', got Termination

Root node children: 0
First token: FOO_TERMINATION at 1:1-1:1
Last token: FOO_TERMINATION at 1:1-1:1

== file | empty-file | correct-encoding | file-reader ==

Errors:
empty.txt:1:1: Expected 'example', got Termination

Root node children: 0
First token: FOO_TERMINATION at 1:1-1:1
Last token: FOO_TERMINATION at 1:1-1:1

== file | example-file | wrong-encoding | default ==

Errors:
main-iso-8859-1.txt:1:3: Could not decode source as "utf-8"
main-iso-8859-1.txt:1:1: Expected 'example', got Termination

Root node children: 0
First token: FOO_TERMINATION at 1:1-1:1
Last token: FOO_TERMINATION at 1:1-1:1

== file | example-file | wrong-encoding | file-reader ==

Errors:
main-iso-8859-1.txt:1:3: Could not decode source as "utf-8"
main-iso-8859-1.txt:1:1: Expected 'example', got Termination

Root node children: 0
First token: FOO_TERMINATION at 1:1-1:1
Last token: FOO_TERMINATION at 1:1-1:1

== buffer | example-file | correct-encoding | default ==

Root node children: 10
First token: FOO_COMMENT at 1:1-1:80
Last token: FOO_TERMINATION at 44:1-44:1

== buffer | empty-file | correct-encoding | default ==

Errors:
empty.txt:1:1: Expected 'example', got Termination

Root node children: 0
First token: FOO_TERMINATION at 1:1-1:1
Last token: FOO_TERMINATION at 1:1-1:1

== buffer | example-file | wrong-encoding | default ==

Errors:
main-iso-8859-1.txt:1:3: Could not decode source as "utf-8"
main-iso-8859-1.txt:1:1: Expected 'example', got Termination

Root node children: 0
First token: FOO_TERMINATION at 1:1-1:1
Last token: FOO_TERMINATION at 1:1-1:1

Done
25 changes: 25 additions & 0 deletions testsuite/tests/misc/unicode/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
"""
Check that the handling of Unicode for various parsing settings (get from
file/buffer, encoding, file reader, ...) works correctly.
"""

from langkit.dsl import ASTNode, Field, T

from utils import build_and_run


# Base node class for this test: the other node classes below derive from it.
# It declares no fields of its own.
class FooNode(ASTNode):
pass


# Node with a single field "f" whose type is StrLit (referenced through T
# because StrLit is defined further down).
class Example(FooNode):
f = Field(type=T.StrLit)


# Node flagged as a token node (token_node = True); it is the type of
# Example.f.
class StrLit(FooNode):
token_node = True


# Hand the Lkt spec and the Ada main over to the testsuite helper.
# NOTE(review): presumably this generates and builds the library, then runs
# main.adb -- confirm against utils.build_and_run.
build_and_run(lkt_file="expected_concrete_syntax.lkt", gpr_mains=["main.adb"])

# Final marker line printed after build_and_run returns.
print("Done")
1 change: 1 addition & 0 deletions testsuite/tests/misc/unicode/test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
driver: python

0 comments on commit 2957667

Please sign in to comment.