Improve block lookup

composewell · Jun 15, 2022 · 9f0a284 · 9f0a284
1 parent f24893a
commit 9f0a284
Show file tree

Hide file tree

Showing 3 changed files with 472 additions and 747 deletions.
diff --git a/unicode-data/exe/Parser/Text.hs b/unicode-data/exe/Parser/Text.hs
@@ -237,6 +237,10 @@ enumMapToAddrLiteral xs cs = foldr go cs xs
         then fromIntegral w
         else error $ "Cannot convert to Word8: " <> show a
 
+-- Encode Word32 to [Word8] little endian
+word32ToWord8s :: Word32 -> [Word8]
+word32ToWord8s n = (\k -> fromIntegral ((n `shiftR` k) .&. 0xff)) <$> [0,8..24]
+
 -- This bit of code is duplicated but this duplication allows us to reduce 2
 -- dependencies on the executable.
 
@@ -278,55 +282,101 @@ genBlocksModule
 genBlocksModule moduleName = done <$> Fold.foldl' step initial
     where
 
-    done (blocks, defs, ranges, _) = unlines
+    done (blocks, defs, ranges) = let ranges' = reverse ranges in unlines
         [ apacheLicense 2022 moduleName
-        , "{-# LANGUAGE MultiWayIf #-}"
+        , "{-# LANGUAGE CPP, MultiWayIf #-}"
         , "{-# OPTIONS_HADDOCK hide #-}"
         , ""
         , "module " <> moduleName
-        , "(Block(..), BlockDefinition(..), block, blockDefinition)"
+        , "(Block(..), BlockDefinition(..), block, blockDefinition, allBlockRanges)"
         , "where"
         , ""
-        , "import Data.Char (ord)"
         , "import Data.Ix (Ix)"
+        , "import GHC.Exts"
         , ""
         , "-- [TODO] @since"
         , "-- | Unicode block."
         , "data Block"
-        , "  = " <> mconcat (intersperse "\n  | " (reverse blocks))
-        , "  deriving (Enum, Bounded, Eq, Ord, Ix, Show)"
+        , "    = " <> mconcat (intersperse "\n  | " (reverse blocks))
+        , "    deriving (Enum, Bounded, Eq, Ord, Ix, Show)"
         , ""
         , "-- [TODO] @since"
         , "-- | Block definition: range and name."
         , "data BlockDefinition = BlockDefinition"
-        , "  { blockRange :: !(Int, Int) -- ^ Range"
-        , "  , blockName :: !String -- ^ Name"
-        , "  } deriving (Eq, Ord, Show)"
+        , "    { blockRange :: !(Int, Int) -- ^ Range"
+        , "    , blockName :: !String -- ^ Name"
+        , "    } deriving (Eq, Ord, Show)"
         , ""
         , "-- [TODO] @since"
         , "-- | Block definition"
         , "blockDefinition :: Block -> BlockDefinition"
         , "blockDefinition b = case b of"
         , mconcat (reverse defs)
         , "-- [TODO] @since"
+        , "-- | All the block ranges, in ascending order."
+        , "{-# INLINE allBlockRanges #-}"
+        , "allBlockRanges :: [(Int, Int)]"
+        , "allBlockRanges ="
+        , "    " <> show ranges'
+        , ""
+        , "-- [TODO] @since"
         , "-- | Character block, if defined."
-        , "block :: Char -> Maybe Block"
-        , "block c"
-        , mconcat (reverse ranges)
-        , "  | otherwise = Nothing"
-        , "  where cp = ord c"
+        , "block :: Char -> Maybe Int"
+        , "block (C# c#) = getBlock 0# " <> shows (length ranges - 1) "#"
+        , "    where"
+        , "    -- [NOTE] Encoding"
+        , "    -- A range is encoded as two LE Word32:"
+        , "    -- • First one is the lower bound, where the higher 11 bits are the block"
+        , "    --   index and the lower 21 bits are the codepoint."
+        , "    -- • Second one is the upper bound, which correspond to the codepoint."
+        , ""
+        , "    cp# = int2Word# (ord# c#)"
+        , ""
+        , "    -- Binary search"
+        , "    getBlock l# u# = if isTrue# (l# ># u#)"
+        , "        then Nothing"
+        , "        else"
+        , "            let k# = l# +# uncheckedIShiftRL# (u# -# l#) 1#"
+        , "                j# = k# `uncheckedIShiftL#` 1#"
+        , "                cpL0# = getRawCodePoint# j#"
+        , "                cpL# = cpL0# `and#` 0x1fffff## -- Mask for codepoint: [0..0x10fff]"
+        , "                cpU# = getRawCodePoint# (j# +# 1#)"
+        , "            in if isTrue# (cpU# `ltWord#` cp#)"
+        , "                -- cp > upper bound"
+        , "                then getBlock (k# +# 1#) u#"
+        , "                -- check lower bound"
+        , "                else if isTrue# (cp# `ltWord#` cpL#)"
+        , "                    -- cp < lower bound"
+        , "                    then getBlock l# (k# -# 1#)"
+        , "                    -- cp in block: get block index"
+        , "                    else let block# = cpL0# `uncheckedShiftRL#` 21#"
+        , "                         in Just (I# (word2Int# block#))"
+        , ""
+        , "    getRawCodePoint# k# ="
+        , "#ifdef WORDS_BIGENDIAN"
+        , "#if MIN_VERSION_base(4,16,0)"
+        , "        byteSwap32# (word32ToWord# (indexWord32OffAddr# addr# k#))"
+        , "#else"
+        , "        byteSwap32# (indexWord32OffAddr# ranges# k#)"
+        , "#endif"
+        , "#elif MIN_VERSION_base(4,16,0)"
+        , "        word32ToWord# (indexWord32OffAddr# ranges# k#)"
+        , "#else"
+        , "        indexWord32OffAddr# ranges# k#"
+        , "#endif"
+        , ""
+        , "    -- Encoded ranges"
+        , "    ranges# = \"" <> enumMapToAddrLiteral (mkRanges ranges') "\"#"
         ]
 
-    initial :: ([String], [String], [String], Int)
-    initial = (mempty, mempty, mempty, 0)
+    initial :: ([String], [String], [(Int, Int)])
+    initial = (mempty, mempty, mempty)
 
-    step (blocks, defs, ranges, expected) (blockName, blockRange) =
+    step (blocks, defs, ranges) (blockName, blockRange) =
         let blockID = mkHaskellConstructor blockName
-            precedByNoBlock = expected < fst blockRange
         in ( mkBlockConstructor blockID blockName blockRange : blocks
            , mkBlockDef   blockID blockName blockRange : defs
-           , mkBlockRange blockID blockRange precedByNoBlock : ranges
-           , succ (snd blockRange) )
+           , blockRange : ranges )
 
     mkBlockConstructor blockID blockName (l, u) = mconcat
         [ blockID
@@ -340,7 +390,7 @@ genBlocksModule moduleName = done <$> Fold.foldl' step initial
         ]
 
     mkBlockDef blockID blockName (l, u) = mconcat
-        [ "  "
+        [ "    "
         , blockID
         , " -> BlockDefinition (0x"
         , showPaddedHex l
@@ -351,22 +401,17 @@ genBlocksModule moduleName = done <$> Fold.foldl' step initial
         , "\n"
         ]
 
-    mkBlockRange :: String -> (Int, Int) -> Bool -> String
-    mkBlockRange blockID blockRange precedByNoBlock =
-        let {
-            noBlock = mconcat
-                [ "  | cp <= 0x"
-                , showHex (fst blockRange - 1) " = Nothing\n"
-                ];
-            block = mconcat
-                [ "  | cp <= 0x"
-                , showHex (snd blockRange) " = Just "
-                , blockID
-                , "\n"
-                ]
-        } in if precedByNoBlock
-            then noBlock <> block
-            else block
+    -- [NOTE] Encoding: a range is encoded as two LE Word32:
+    -- • First one is the lower bound, where the higher 11 bits are the block
+    --   index and the lower 21 bits are the codepoint.
+    -- • Second one is upper bound, which correspond to the codepoint.
+    mkRanges :: [(Int, Int)] -> [Word8]
+    mkRanges = foldMap (uncurry mkBlockRange) . zip [0..]
+    mkBlockRange :: Word32 -> (Int, Int) -> [Word8]
+    mkBlockRange idx (l, u) = encodeBound idx l <> encodeBound 0 u
+
+    encodeBound :: Word32 -> Int -> [Word8]
+    encodeBound idx n = word32ToWord8s ((idx `shiftL` 21) .|. fromIntegral n)
 
 genScriptsModule
     :: Monad m
@@ -493,11 +538,8 @@ genScriptsModule moduleName =
         Left  c      -> encodeBytes (fromIntegral (ord c))
         Right (l, u) -> encodeBytes (setBit (fromIntegral (ord l)) 31)
                      <> encodeBytes (setBit (fromIntegral (ord u)) 31)
-    encodeBytes = foldr addByte "" . toWord8s
+    encodeBytes = foldr addByte "" . word32ToWord8s
     addByte n acc = '\\' : shows n acc
-    -- Encode Word32 to [Word8] little endian
-    toWord8s :: Word32 -> [Word8]
-    toWord8s n = (\k -> fromIntegral ((n `shiftR` k) .&. 0xff)) <$> [0,8..24]
 
     mkCharScripts :: [String] -> [ScriptLine] -> String
     mkCharScripts scripts scriptsRanges =

diff --git a/unicode-data/lib/Unicode/Char/General/Blocks.hs b/unicode-data/lib/Unicode/Char/General/Blocks.hs
@@ -12,7 +12,7 @@
 module Unicode.Char.General.Blocks
     ( B.Block(..)
     , B.BlockDefinition(..)
-    , B.block
+    , block
     , B.blockDefinition
     , inBlock
     , allBlockRanges
@@ -26,6 +26,12 @@ import Data.Char (chr, ord)
 import Data.Ix (inRange)
 import qualified Unicode.Internal.Char.Blocks as B
 
+-- [TODO] @since
+-- | Character block, if defined.
+{-# INLINE block #-}
+block :: Char -> Maybe B.Block
+block = fmap toEnum . B.block
+
 -- [TODO] @since
 -- | Check if a character is in a block.
 {-# INLINE inBlock #-}
@@ -36,11 +42,11 @@ inBlock b = inRange (B.blockRange (B.blockDefinition b)) . ord
 -- | All the block ranges, in ascending order.
 {-# INLINE allBlockRanges #-}
 allBlockRanges :: [(Int, Int)]
-allBlockRanges = B.blockRange . B.blockDefinition <$> [minBound..maxBound]
+allBlockRanges = B.allBlockRanges
 
 -- [TODO] @since
 -- | Variant of 'allBlockRanges', with ranges expressed as 'Char's instead of
 -- 'Int's.
 {-# INLINE allBlockRanges' #-}
 allBlockRanges' :: [(Char, Char)]
-allBlockRanges' = (chr *** chr) <$> allBlockRanges
+allBlockRanges' = (chr *** chr) <$> B.allBlockRanges