From aa9a22291468e0a1912a6519e8752f29f0ee0b5c Mon Sep 17 00:00:00 2001 From: Vadim Godunko Date: Tue, 12 Mar 2024 12:27:42 +0400 Subject: [PATCH 1/6] Use Forward_Element to improve performance. --- ...strings-cursors-iterators-grapheme_clusters.adb | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/source/text/implementation/vss-strings-cursors-iterators-grapheme_clusters.adb b/source/text/implementation/vss-strings-cursors-iterators-grapheme_clusters.adb index 920f1d4d..571e15f5 100644 --- a/source/text/implementation/vss-strings-cursors-iterators-grapheme_clusters.adb +++ b/source/text/implementation/vss-strings-cursors-iterators-grapheme_clusters.adb @@ -411,6 +411,7 @@ package body VSS.Strings.Cursors.Iterators.Grapheme_Clusters is Left : VSS.Implementation.Strings.Cursor; Left_Properties : VSS.Implementation.UCD_Core.Core_Data_Record; Right : VSS.Implementation.Strings.Cursor; + Right_Code : VSS.Unicode.Code_Point'Base; Right_Properties : VSS.Implementation.UCD_Core.Core_Data_Record; Success : Boolean; Done : Boolean := False; @@ -426,7 +427,8 @@ package body VSS.Strings.Cursors.Iterators.Grapheme_Clusters is Handler := VSS.Implementation.Strings.Handler (Data); Self.First_Position := Self.Last_Position; - Success := Handler.Forward (Data, Self.First_Position); + Success := + Handler.Forward_Element (Data, Self.First_Position, Right_Code); if not Success then -- End of the string has been reached. 
@@ -435,15 +437,14 @@ package body VSS.Strings.Cursors.Iterators.Grapheme_Clusters is return False; else - Right := Self.First_Position; - Right_Properties := - Extract_Core_Data (Handler.Element (Data, Right)); + Right := Self.First_Position; + Right_Properties := Extract_Core_Data (Right_Code); loop Left := Right; Left_Properties := Right_Properties; - Success := Handler.Forward (Data, Right); + Success := Handler.Forward_Element (Data, Right, Right_Code); if not Success then -- End of line has been reached @@ -454,8 +455,7 @@ package body VSS.Strings.Cursors.Iterators.Grapheme_Clusters is return True; else - Right_Properties := - Extract_Core_Data (Handler.Element (Data, Right)); + Right_Properties := Extract_Core_Data (Right_Code); if Left_Properties.GCB = GCB_CR and Right_Properties.GCB = GCB_LF From 025a08c47786c79fcf94cf583c61e2948d08dead Mon Sep 17 00:00:00 2001 From: Vadim Godunko Date: Tue, 12 Mar 2024 14:08:12 +0400 Subject: [PATCH 2/6] Improve algorithm to compute length of the character in UTF-8 and UTF-16. --- ...ss-implementation-utf8_string_handlers.adb | 58 ++++++++----------- 1 file changed, 25 insertions(+), 33 deletions(-) diff --git a/source/text/implementation/vss-implementation-utf8_string_handlers.adb b/source/text/implementation/vss-implementation-utf8_string_handlers.adb index 68666349..7df99250 100644 --- a/source/text/implementation/vss-implementation-utf8_string_handlers.adb +++ b/source/text/implementation/vss-implementation-utf8_string_handlers.adb @@ -7,7 +7,9 @@ -- Generic implementation of the string which use UTF-8 encoding for data. with Ada.Unchecked_Deallocation; +with Interfaces; +with VSS.Implementation.GCC; with VSS.Implementation.Line_Iterators; with VSS.Implementation.String_Configuration; @@ -1832,44 +1834,34 @@ package body VSS.Implementation.UTF8_String_Handlers is end if; declare - Code : constant VSS.Unicode.UTF8_Code_Unit := - Storage (Position.UTF8_Offset); - - begin - case Code is - when 16#00# .. 
16#7F# => - Position.UTF8_Offset := Position.UTF8_Offset + 1; - Position.UTF16_Offset := Position.UTF16_Offset + 1; + use type Interfaces.Integer_32; + use type VSS.Unicode.UTF8_Code_Unit; - when 16#C2# .. 16#DF# => - Position.UTF8_Offset := Position.UTF8_Offset + 2; - Position.UTF16_Offset := Position.UTF16_Offset + 1; + -- This code is based on the fact that starting byte of the + -- multibyte sequence in UTF-8 has N most significant bits set to + -- one followed by zero bit. So, first byte is negated and number + -- of leading zero bits is counted. - when 16#E0# .. 16#EF# => - Position.UTF8_Offset := Position.UTF8_Offset + 3; - Position.UTF16_Offset := Position.UTF16_Offset + 1; + Code : constant VSS.Unicode.UTF8_Code_Unit := + Storage (Position.UTF8_Offset); + Length : constant Interfaces.Integer_32 := + VSS.Implementation.GCC.clz (Interfaces.Unsigned_32 (not Code)) + - 24; - when 16#F0# .. 16#F4# => - Position.UTF8_Offset := Position.UTF8_Offset + 4; - Position.UTF16_Offset := Position.UTF16_Offset + 2; + begin + if Code <= 16#7F# then + Position.UTF8_Offset := Position.UTF8_Offset + 1; + Position.UTF16_Offset := Position.UTF16_Offset + 1; - when others => - raise Program_Error with "string data is corrupted"; - end case; + else + Position.UTF8_Offset := + Position.UTF8_Offset + + VSS.Unicode.UTF8_Code_Unit_Offset (Length); + Position.UTF16_Offset := + Position.UTF16_Offset + + VSS.Unicode.UTF16_Code_Unit_Offset (Length / 4 + 1); + end if; end; - - -- XXX case statement above may be rewritten as below to avoid - -- use of branch instructions. 
- -- - -- Position.UTF8_Offset := - -- Position.UTF8_Offset + 1 - -- + (if (Code and 2#1000_0000#) = 2#1000_0000# then 1 else 0) - -- + (if (Code and 2#1110_0000#) = 2#1110_0000# then 1 else 0) - -- + (if (Code and 2#1111_0000#) = 2#1111_0000# then 1 else 0); - -- - -- Position.UTF16_Offset := - -- Position.UTF16_Offset + 1 - -- + (if (Code and 2#1111_0000#) = 2#1111_0000# then 1 else 0); end Unchecked_Forward; ----------------- From 3f05e95b9803efe3a182f41b6564d41fc4524415 Mon Sep 17 00:00:00 2001 From: Vadim Godunko Date: Tue, 12 Mar 2024 16:02:18 +0400 Subject: [PATCH 3/6] Fix typo. --- data/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/README.md b/data/README.md index 7ac058b0..8948de72 100644 --- a/data/README.md +++ b/data/README.md @@ -17,7 +17,7 @@ curl -o emoji/emoji-test.txt https://www.unicode.org/Public/emoji/15.1/emoji-tes curl -o emoji/emoji-zwj-sequences.txt https://www.unicode.org/Public/emoji/15.1/emoji-zwj-sequences.txt git clone https://github.com/nigeltao/parse-number-fxx-test-data rm -rf parse-number-fxx-test-data/.git -git close https://github.com/json5/json5-tests.git +git clone https://github.com/json5/json5-tests.git rm -rf json5-tests/.git curl -O https://raw.githubusercontent.com/Perl/perl5/blead/t/re/re_tests tar caf ../vss-tests-data.tar.bz2 . From cebfedb9c3e478ebc7046a8204803a7da4ecdf40 Mon Sep 17 00:00:00 2001 From: Vadim Godunko Date: Tue, 12 Mar 2024 16:04:23 +0400 Subject: [PATCH 4/6] Remove unnecessary nesting levels. 
--- ...gs-cursors-iterators-grapheme_clusters.adb | 158 +++++++++--------- 1 file changed, 78 insertions(+), 80 deletions(-) diff --git a/source/text/implementation/vss-strings-cursors-iterators-grapheme_clusters.adb b/source/text/implementation/vss-strings-cursors-iterators-grapheme_clusters.adb index 571e15f5..fce393e7 100644 --- a/source/text/implementation/vss-strings-cursors-iterators-grapheme_clusters.adb +++ b/source/text/implementation/vss-strings-cursors-iterators-grapheme_clusters.adb @@ -435,120 +435,118 @@ package body VSS.Strings.Cursors.Iterators.Grapheme_Clusters is -- XXX Should Last_Position be set to After_Last_Character? return False; + end if; - else - Right := Self.First_Position; - Right_Properties := Extract_Core_Data (Right_Code); + Right := Self.First_Position; + Right_Properties := Extract_Core_Data (Right_Code); - loop - Left := Right; - Left_Properties := Right_Properties; + loop + Left := Right; + Left_Properties := Right_Properties; - Success := Handler.Forward_Element (Data, Right, Right_Code); + Success := Handler.Forward_Element (Data, Right, Right_Code); - if not Success then - -- End of line has been reached - -- Rule GB2 + if not Success then + -- End of line has been reached + -- Rule GB2 - Self.Last_Position := Left; + Self.Last_Position := Left; - return True; + return True; + end if; - else - Right_Properties := Extract_Core_Data (Right_Code); + Right_Properties := Extract_Core_Data (Right_Code); - if Left_Properties.GCB = GCB_CR - and Right_Properties.GCB = GCB_LF - then - -- Rule GB3 + if Left_Properties.GCB = GCB_CR + and Right_Properties.GCB = GCB_LF + then + -- Rule GB3 - null; + null; - elsif Left_Properties.GCB in GCB_CN | GCB_CR | GCB_LF then - -- Rule GB4 + elsif Left_Properties.GCB in GCB_CN | GCB_CR | GCB_LF then + -- Rule GB4 - Done := True; + Done := True; - elsif Right_Properties.GCB in GCB_CN | GCB_CR | GCB_LF then - -- Rule GB5 + elsif Right_Properties.GCB in GCB_CN | GCB_CR | GCB_LF then + -- Rule GB5 - Done 
:= True; + Done := True; - elsif Left_Properties.GCB = GCB_L - and then Right_Properties.GCB - in GCB_L | GCB_V | GCB_LV | GCB_LVT - then - -- Rule GB6 + elsif Left_Properties.GCB = GCB_L + and then Right_Properties.GCB + in GCB_L | GCB_V | GCB_LV | GCB_LVT + then + -- Rule GB6 - null; + null; - elsif Left_Properties.GCB in GCB_LV | GCB_V - and then Right_Properties.GCB in GCB_V | GCB_T - then - -- Rule GB7 + elsif Left_Properties.GCB in GCB_LV | GCB_V + and then Right_Properties.GCB in GCB_V | GCB_T + then + -- Rule GB7 - null; + null; - elsif Left_Properties.GCB in GCB_LVT | GCB_T - and then Right_Properties.GCB = GCB_T - then - -- Rule GB8 + elsif Left_Properties.GCB in GCB_LVT | GCB_T + and then Right_Properties.GCB = GCB_T + then + -- Rule GB8 - null; + null; - elsif Right_Properties.GCB in GCB_EX | GCB_ZWJ then - -- Rule GB9 + elsif Right_Properties.GCB in GCB_EX | GCB_ZWJ then + -- Rule GB9 - null; + null; - elsif Right_Properties.GCB = GCB_SM then - -- Rule GB9a + elsif Right_Properties.GCB = GCB_SM then + -- Rule GB9a - null; + null; - elsif Left_Properties.GCB = GCB_PP then - -- Rule GB9b + elsif Left_Properties.GCB = GCB_PP then + -- Rule GB9b - null; + null; - elsif Left_Properties.InCB in INCB_Linker | INCB_Extend - and then Right_Properties.InCB = INCB_Consonant - and then Apply_InCB - (Handler, Data, Left, Left_Properties.InCB = INCB_Linker) - then - -- Rule GB9c. + elsif Left_Properties.InCB in INCB_Linker | INCB_Extend + and then Right_Properties.InCB = INCB_Consonant + and then Apply_InCB + (Handler, Data, Left, Left_Properties.InCB = INCB_Linker) + then + -- Rule GB9c. - null; + null; - elsif Left_Properties.GCB = GCB_ZWJ - and then Right_Properties.ExtPict - and then Apply_ExtPict (Handler, Data, Left) - then - -- Rule GB11. + elsif Left_Properties.GCB = GCB_ZWJ + and then Right_Properties.ExtPict + and then Apply_ExtPict (Handler, Data, Left) + then + -- Rule GB11. 
- null; + null; - elsif Left_Properties.GCB = GCB_RI - and then Right_Properties.GCB = GCB_RI - and then Apply_RI (Handler, Data, Left) - then - -- Rule GB12. - -- Rule GB13. + elsif Left_Properties.GCB = GCB_RI + and then Right_Properties.GCB = GCB_RI + and then Apply_RI (Handler, Data, Left) + then + -- Rule GB12. + -- Rule GB13. - null; + null; - else - Done := True; - end if; + else + Done := True; + end if; - if Done then - Self.Last_Position := Left; + if Done then + Self.Last_Position := Left; - return True; - end if; - end if; - end loop; - end if; + return True; + end if; + end loop; end Forward; ----------------- From fa69a6f2942e9af936e8ae3a28200e5ef5c2022a Mon Sep 17 00:00:00 2001 From: Vadim Godunko Date: Tue, 12 Mar 2024 16:59:00 +0400 Subject: [PATCH 5/6] Use state table to speedup iteration. --- ...gs-cursors-iterators-grapheme_clusters.adb | 159 +++++++++--------- 1 file changed, 84 insertions(+), 75 deletions(-) diff --git a/source/text/implementation/vss-strings-cursors-iterators-grapheme_clusters.adb b/source/text/implementation/vss-strings-cursors-iterators-grapheme_clusters.adb index fce393e7..513bf5ee 100644 --- a/source/text/implementation/vss-strings-cursors-iterators-grapheme_clusters.adb +++ b/source/text/implementation/vss-strings-cursors-iterators-grapheme_clusters.adb @@ -50,6 +50,60 @@ package body VSS.Strings.Cursors.Iterators.Grapheme_Clusters is Is_Linker : Boolean) return Boolean; -- Scan string backward to check whether Rule GB9c should be applied. + type GCB_Action is (Break, No_Break, Unspecified); + + -- Table below encodes segmentation rules that depends from the value of + -- the GCB property only. 
+ + Forward_GCB_Rules : constant array + (VSS.Implementation.UCD_Core.GCB_Values, + VSS.Implementation.UCD_Core.GCB_Values) of GCB_Action := + (GCB_CN => (others => Break), -- Rule GB4 + GCB_CR => + (GCB_LF => No_Break, -- Rule GB3 + others => Break), -- Rule GB4 + GCB_L => + (GCB_CN | GCB_CR | GCB_LF => Break, -- Rule GB5 + GCB_L | GCB_V | GCB_LV | GCB_LVT => No_Break, -- Rule GB6 + GCB_EX | GCB_ZWJ => No_Break, -- Rule GB9 + GCB_SM => No_Break, -- Rule GB9a + others => Unspecified), + GCB_LF => (others => Break), -- Rule GB4 + GCB_LV => + (GCB_CN | GCB_CR | GCB_LF => Break, -- Rule GB5 + GCB_V | GCB_T => No_Break, -- Rule GB7 + GCB_EX | GCB_ZWJ => No_Break, -- Rule GB9 + GCB_SM => No_Break, -- Rule GB9a + others => Unspecified), + GCB_LVT => + (GCB_CN | GCB_CR | GCB_LF => Break, -- Rule GB5 + GCB_T => No_Break, -- Rule GB8 + GCB_EX | GCB_ZWJ => No_Break, -- Rule GB9 + GCB_SM => No_Break, -- Rule GB9a + others => Unspecified), + GCB_PP => + (GCB_CN | GCB_CR | GCB_LF => Break, -- Rule GB5 + GCB_EX | GCB_ZWJ => No_Break, -- Rule GB9 + GCB_SM => No_Break, -- Rule GB9a + others => No_Break), -- Rule GB9b + GCB_T => + (GCB_CN | GCB_CR | GCB_LF => Break, -- Rule GB5 + GCB_T => No_Break, -- Rule GB8 + GCB_EX | GCB_ZWJ => No_Break, -- Rule GB9 + GCB_SM => No_Break, -- Rule GB9a + others => Unspecified), + GCB_V => + (GCB_CN | GCB_CR | GCB_LF => Break, -- Rule GB5 + GCB_V | GCB_T => No_Break, -- Rule GB7 + GCB_EX | GCB_ZWJ => No_Break, -- Rule GB9 + GCB_SM => No_Break, -- Rule GB9a + others => Unspecified), + GCB_EX | GCB_RI | GCB_SM | GCB_XX | GCB_ZWJ => + (GCB_CN | GCB_CR | GCB_LF => Break, -- Rule GB5 + GCB_EX | GCB_ZWJ => No_Break, -- Rule GB9 + GCB_SM => No_Break, -- Rule GB9a + others => Unspecified)); + ------------------- -- Apply_ExtPict -- ------------------- @@ -457,89 +511,44 @@ package body VSS.Strings.Cursors.Iterators.Grapheme_Clusters is Right_Properties := Extract_Core_Data (Right_Code); - if Left_Properties.GCB = GCB_CR - and Right_Properties.GCB = GCB_LF 
- then - -- Rule GB3 - - null; - - elsif Left_Properties.GCB in GCB_CN | GCB_CR | GCB_LF then - -- Rule GB4 - - Done := True; - - elsif Right_Properties.GCB in GCB_CN | GCB_CR | GCB_LF then - -- Rule GB5 - - Done := True; + case Forward_GCB_Rules (Left_Properties.GCB, Right_Properties.GCB) is + when Break => + Done := True; - elsif Left_Properties.GCB = GCB_L - and then Right_Properties.GCB - in GCB_L | GCB_V | GCB_LV | GCB_LVT - then - -- Rule GB6 - - null; - - elsif Left_Properties.GCB in GCB_LV | GCB_V - and then Right_Properties.GCB in GCB_V | GCB_T - then - -- Rule GB7 - - null; - - elsif Left_Properties.GCB in GCB_LVT | GCB_T - and then Right_Properties.GCB = GCB_T - then - -- Rule GB8 - - null; - - elsif Right_Properties.GCB in GCB_EX | GCB_ZWJ then - -- Rule GB9 - - null; - - elsif Right_Properties.GCB = GCB_SM then - -- Rule GB9a - - null; - - elsif Left_Properties.GCB = GCB_PP then - -- Rule GB9b - - null; + when No_Break => + null; - elsif Left_Properties.InCB in INCB_Linker | INCB_Extend - and then Right_Properties.InCB = INCB_Consonant - and then Apply_InCB - (Handler, Data, Left, Left_Properties.InCB = INCB_Linker) - then - -- Rule GB9c. + when Unspecified => + if Left_Properties.InCB in INCB_Linker | INCB_Extend + and then Right_Properties.InCB = INCB_Consonant + and then Apply_InCB + (Handler, Data, Left, Left_Properties.InCB = INCB_Linker) + then + -- Rule GB9c. - null; + null; - elsif Left_Properties.GCB = GCB_ZWJ - and then Right_Properties.ExtPict - and then Apply_ExtPict (Handler, Data, Left) - then - -- Rule GB11. + elsif Left_Properties.GCB = GCB_ZWJ + and then Right_Properties.ExtPict + and then Apply_ExtPict (Handler, Data, Left) + then + -- Rule GB11. - null; + null; - elsif Left_Properties.GCB = GCB_RI - and then Right_Properties.GCB = GCB_RI - and then Apply_RI (Handler, Data, Left) - then - -- Rule GB12. - -- Rule GB13. 
+ elsif Left_Properties.GCB = GCB_RI + and then Right_Properties.GCB = GCB_RI + and then Apply_RI (Handler, Data, Left) + then + -- Rule GB12. + -- Rule GB13. - null; + null; - else - Done := True; - end if; + else + Done := True; + end if; + end case; if Done then Self.Last_Position := Left; From 7855b6d9f672fa6e54d0b168af4244a38686798b Mon Sep 17 00:00:00 2001 From: Max Reznik Date: Tue, 12 Mar 2024 18:11:57 +0000 Subject: [PATCH 6/6] Apply 1 suggestion(s) to 1 file(s) --- .../vss-strings-cursors-iterators-grapheme_clusters.adb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/text/implementation/vss-strings-cursors-iterators-grapheme_clusters.adb b/source/text/implementation/vss-strings-cursors-iterators-grapheme_clusters.adb index 513bf5ee..cc7d029c 100644 --- a/source/text/implementation/vss-strings-cursors-iterators-grapheme_clusters.adb +++ b/source/text/implementation/vss-strings-cursors-iterators-grapheme_clusters.adb @@ -52,8 +52,8 @@ package body VSS.Strings.Cursors.Iterators.Grapheme_Clusters is type GCB_Action is (Break, No_Break, Unspecified); - -- Table below encodes segmentation rules that depends from the value of - -- the GCB property only. + -- The table below encodes segmentation rules that depend only on the + -- value of the GCB property. Forward_GCB_Rules : constant array (VSS.Implementation.UCD_Core.GCB_Values,