diff --git a/data/README.md b/data/README.md index 7ac058b0..8948de72 100644 --- a/data/README.md +++ b/data/README.md @@ -17,7 +17,7 @@ curl -o emoji/emoji-test.txt https://www.unicode.org/Public/emoji/15.1/emoji-tes curl -o emoji/emoji-zwj-sequences.txt https://www.unicode.org/Public/emoji/15.1/emoji-zwj-sequences.txt git clone https://github.com/nigeltao/parse-number-fxx-test-data rm -rf parse-number-fxx-test-data/.git -git close https://github.com/json5/json5-tests.git +git clone https://github.com/json5/json5-tests.git rm -rf json5-tests/.git curl -O https://raw.githubusercontent.com/Perl/perl5/blead/t/re/re_tests tar caf ../vss-tests-data.tar.bz2 . diff --git a/source/text/implementation/vss-implementation-utf8_string_handlers.adb b/source/text/implementation/vss-implementation-utf8_string_handlers.adb index 68666349..7df99250 100644 --- a/source/text/implementation/vss-implementation-utf8_string_handlers.adb +++ b/source/text/implementation/vss-implementation-utf8_string_handlers.adb @@ -7,7 +7,9 @@ -- Generic implementation of the string which use UTF-8 encoding for data. with Ada.Unchecked_Deallocation; +with Interfaces; +with VSS.Implementation.GCC; with VSS.Implementation.Line_Iterators; with VSS.Implementation.String_Configuration; @@ -1832,44 +1834,34 @@ package body VSS.Implementation.UTF8_String_Handlers is end if; declare - Code : constant VSS.Unicode.UTF8_Code_Unit := - Storage (Position.UTF8_Offset); - - begin - case Code is - when 16#00# .. 16#7F# => - Position.UTF8_Offset := Position.UTF8_Offset + 1; - Position.UTF16_Offset := Position.UTF16_Offset + 1; + use type Interfaces.Integer_32; + use type VSS.Unicode.UTF8_Code_Unit; - when 16#C2# .. 16#DF# => - Position.UTF8_Offset := Position.UTF8_Offset + 2; - Position.UTF16_Offset := Position.UTF16_Offset + 1; + -- This code is based on the fact that starting byte of the + -- multibyte sequence in UTF-8 has N most significant bits set to + -- one followed by zero bit. 
So, first byte is negated and number + -- of leading zero bits is counted. - when 16#E0# .. 16#EF# => - Position.UTF8_Offset := Position.UTF8_Offset + 3; - Position.UTF16_Offset := Position.UTF16_Offset + 1; + Code : constant VSS.Unicode.UTF8_Code_Unit := + Storage (Position.UTF8_Offset); + Length : constant Interfaces.Integer_32 := + VSS.Implementation.GCC.clz (Interfaces.Unsigned_32 (not Code)) + - 24; - when 16#F0# .. 16#F4# => - Position.UTF8_Offset := Position.UTF8_Offset + 4; - Position.UTF16_Offset := Position.UTF16_Offset + 2; + begin + if Code <= 16#7F# then + Position.UTF8_Offset := Position.UTF8_Offset + 1; + Position.UTF16_Offset := Position.UTF16_Offset + 1; - when others => - raise Program_Error with "string data is corrupted"; - end case; + else + Position.UTF8_Offset := + Position.UTF8_Offset + + VSS.Unicode.UTF8_Code_Unit_Offset (Length); + Position.UTF16_Offset := + Position.UTF16_Offset + + VSS.Unicode.UTF16_Code_Unit_Offset (Length / 4 + 1); + end if; end; - - -- XXX case statement above may be rewritten as below to avoid - -- use of branch instructions. 
- -- - -- Position.UTF8_Offset := - -- Position.UTF8_Offset + 1 - -- + (if (Code and 2#1000_0000#) = 2#1000_0000# then 1 else 0) - -- + (if (Code and 2#1110_0000#) = 2#1110_0000# then 1 else 0) - -- + (if (Code and 2#1111_0000#) = 2#1111_0000# then 1 else 0); - -- - -- Position.UTF16_Offset := - -- Position.UTF16_Offset + 1 - -- + (if (Code and 2#1111_0000#) = 2#1111_0000# then 1 else 0); end Unchecked_Forward; ----------------- diff --git a/source/text/implementation/vss-strings-cursors-iterators-grapheme_clusters.adb b/source/text/implementation/vss-strings-cursors-iterators-grapheme_clusters.adb index 920f1d4d..cc7d029c 100644 --- a/source/text/implementation/vss-strings-cursors-iterators-grapheme_clusters.adb +++ b/source/text/implementation/vss-strings-cursors-iterators-grapheme_clusters.adb @@ -50,6 +50,60 @@ package body VSS.Strings.Cursors.Iterators.Grapheme_Clusters is Is_Linker : Boolean) return Boolean; -- Scan string backward to check whether Rule GB9c should be applied. + type GCB_Action is (Break, No_Break, Unspecified); + + -- The table below encodes segmentation rules that depend only on the + -- value of the GCB property. 
+ + Forward_GCB_Rules : constant array + (VSS.Implementation.UCD_Core.GCB_Values, + VSS.Implementation.UCD_Core.GCB_Values) of GCB_Action := + (GCB_CN => (others => Break), -- Rule GB4 + GCB_CR => + (GCB_LF => No_Break, -- Rule GB3 + others => Break), -- Rule GB4 + GCB_L => + (GCB_CN | GCB_CR | GCB_LF => Break, -- Rule GB5 + GCB_L | GCB_V | GCB_LV | GCB_LVT => No_Break, -- Rule GB6 + GCB_EX | GCB_ZWJ => No_Break, -- Rule GB9 + GCB_SM => No_Break, -- Rule GB9a + others => Unspecified), + GCB_LF => (others => Break), -- Rule GB4 + GCB_LV => + (GCB_CN | GCB_CR | GCB_LF => Break, -- Rule GB5 + GCB_V | GCB_T => No_Break, -- Rule GB7 + GCB_EX | GCB_ZWJ => No_Break, -- Rule GB9 + GCB_SM => No_Break, -- Rule GB9a + others => Unspecified), + GCB_LVT => + (GCB_CN | GCB_CR | GCB_LF => Break, -- Rule GB5 + GCB_T => No_Break, -- Rule GB8 + GCB_EX | GCB_ZWJ => No_Break, -- Rule GB9 + GCB_SM => No_Break, -- Rule GB9a + others => Unspecified), + GCB_PP => + (GCB_CN | GCB_CR | GCB_LF => Break, -- Rule GB5 + GCB_EX | GCB_ZWJ => No_Break, -- Rule GB9 + GCB_SM => No_Break, -- Rule GB9a + others => No_Break), -- Rule GB9b + GCB_T => + (GCB_CN | GCB_CR | GCB_LF => Break, -- Rule GB5 + GCB_T => No_Break, -- Rule GB8 + GCB_EX | GCB_ZWJ => No_Break, -- Rule GB9 + GCB_SM => No_Break, -- Rule GB9a + others => Unspecified), + GCB_V => + (GCB_CN | GCB_CR | GCB_LF => Break, -- Rule GB5 + GCB_V | GCB_T => No_Break, -- Rule GB7 + GCB_EX | GCB_ZWJ => No_Break, -- Rule GB9 + GCB_SM => No_Break, -- Rule GB9a + others => Unspecified), + GCB_EX | GCB_RI | GCB_SM | GCB_XX | GCB_ZWJ => + (GCB_CN | GCB_CR | GCB_LF => Break, -- Rule GB5 + GCB_EX | GCB_ZWJ => No_Break, -- Rule GB9 + GCB_SM => No_Break, -- Rule GB9a + others => Unspecified)); + ------------------- -- Apply_ExtPict -- ------------------- @@ -411,6 +465,7 @@ package body VSS.Strings.Cursors.Iterators.Grapheme_Clusters is Left : VSS.Implementation.Strings.Cursor; Left_Properties : VSS.Implementation.UCD_Core.Core_Data_Record; Right : 
VSS.Implementation.Strings.Cursor; + Right_Code : VSS.Unicode.Code_Point'Base; Right_Properties : VSS.Implementation.UCD_Core.Core_Data_Record; Success : Boolean; Done : Boolean := False; @@ -426,92 +481,45 @@ package body VSS.Strings.Cursors.Iterators.Grapheme_Clusters is Handler := VSS.Implementation.Strings.Handler (Data); Self.First_Position := Self.Last_Position; - Success := Handler.Forward (Data, Self.First_Position); + Success := + Handler.Forward_Element (Data, Self.First_Position, Right_Code); if not Success then -- End of the string has been reached. -- XXX Should Last_Position be set to After_Last_Character? return False; + end if; - else - Right := Self.First_Position; - Right_Properties := - Extract_Core_Data (Handler.Element (Data, Right)); - - loop - Left := Right; - Left_Properties := Right_Properties; - - Success := Handler.Forward (Data, Right); - - if not Success then - -- End of line has been reached - -- Rule GB2 - - Self.Last_Position := Left; - - return True; - - else - Right_Properties := - Extract_Core_Data (Handler.Element (Data, Right)); - - if Left_Properties.GCB = GCB_CR - and Right_Properties.GCB = GCB_LF - then - -- Rule GB3 - - null; - - elsif Left_Properties.GCB in GCB_CN | GCB_CR | GCB_LF then - -- Rule GB4 - - Done := True; - - elsif Right_Properties.GCB in GCB_CN | GCB_CR | GCB_LF then - -- Rule GB5 - - Done := True; - - elsif Left_Properties.GCB = GCB_L - and then Right_Properties.GCB - in GCB_L | GCB_V | GCB_LV | GCB_LVT - then - -- Rule GB6 - - null; - - elsif Left_Properties.GCB in GCB_LV | GCB_V - and then Right_Properties.GCB in GCB_V | GCB_T - then - -- Rule GB7 - - null; + Right := Self.First_Position; + Right_Properties := Extract_Core_Data (Right_Code); - elsif Left_Properties.GCB in GCB_LVT | GCB_T - and then Right_Properties.GCB = GCB_T - then - -- Rule GB8 + loop + Left := Right; + Left_Properties := Right_Properties; - null; + Success := Handler.Forward_Element (Data, Right, Right_Code); - elsif 
Right_Properties.GCB in GCB_EX | GCB_ZWJ then - -- Rule GB9 + if not Success then + -- End of string has been reached + -- Rule GB2 - null; + Self.Last_Position := Left; - elsif Right_Properties.GCB = GCB_SM then - -- Rule GB9a + return True; + end if; - null; + Right_Properties := Extract_Core_Data (Right_Code); - elsif Left_Properties.GCB = GCB_PP then - -- Rule GB9b + case Forward_GCB_Rules (Left_Properties.GCB, Right_Properties.GCB) is + when Break => + Done := True; - null; + when No_Break => + null; - elsif Left_Properties.InCB in INCB_Linker | INCB_Extend + when Unspecified => + if Left_Properties.InCB in INCB_Linker | INCB_Extend and then Right_Properties.InCB = INCB_Consonant and then Apply_InCB (Handler, Data, Left, Left_Properties.InCB = INCB_Linker) @@ -540,15 +548,14 @@ package body VSS.Strings.Cursors.Iterators.Grapheme_Clusters is else Done := True; end if; + end case; - if Done then - Self.Last_Position := Left; + if Done then + Self.Last_Position := Left; - return True; - end if; - end if; - end loop; - end if; + return True; + end if; + end loop; end Forward; -----------------