Skip to content

Commit

Permalink
Improve block lookup
Browse files Browse the repository at this point in the history
  • Loading branch information
wismill committed Jun 15, 2022
1 parent f24893a commit 9f0a284
Show file tree
Hide file tree
Showing 3 changed files with 472 additions and 747 deletions.
124 changes: 83 additions & 41 deletions unicode-data/exe/Parser/Text.hs
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,10 @@ enumMapToAddrLiteral xs cs = foldr go cs xs
then fromIntegral w
else error $ "Cannot convert to Word8: " <> show a

-- Encode Word32 to [Word8] little endian
word32ToWord8s :: Word32 -> [Word8]
word32ToWord8s n = (\k -> fromIntegral ((n `shiftR` k) .&. 0xff)) <$> [0,8..24]

-- This bit of code is duplicated but this duplication allows us to reduce 2
-- dependencies on the executable.

Expand Down Expand Up @@ -278,55 +282,101 @@ genBlocksModule
genBlocksModule moduleName = done <$> Fold.foldl' step initial
where

done (blocks, defs, ranges, _) = unlines
done (blocks, defs, ranges) = let ranges' = reverse ranges in unlines
[ apacheLicense 2022 moduleName
, "{-# LANGUAGE MultiWayIf #-}"
, "{-# LANGUAGE CPP, MultiWayIf #-}"
, "{-# OPTIONS_HADDOCK hide #-}"
, ""
, "module " <> moduleName
, "(Block(..), BlockDefinition(..), block, blockDefinition)"
, "(Block(..), BlockDefinition(..), block, blockDefinition, allBlockRanges)"
, "where"
, ""
, "import Data.Char (ord)"
, "import Data.Ix (Ix)"
, "import GHC.Exts"
, ""
, "-- [TODO] @since"
, "-- | Unicode block."
, "data Block"
, " = " <> mconcat (intersperse "\n | " (reverse blocks))
, " deriving (Enum, Bounded, Eq, Ord, Ix, Show)"
, " = " <> mconcat (intersperse "\n | " (reverse blocks))
, " deriving (Enum, Bounded, Eq, Ord, Ix, Show)"
, ""
, "-- [TODO] @since"
, "-- | Block definition: range and name."
, "data BlockDefinition = BlockDefinition"
, " { blockRange :: !(Int, Int) -- ^ Range"
, " , blockName :: !String -- ^ Name"
, " } deriving (Eq, Ord, Show)"
, " { blockRange :: !(Int, Int) -- ^ Range"
, " , blockName :: !String -- ^ Name"
, " } deriving (Eq, Ord, Show)"
, ""
, "-- [TODO] @since"
, "-- | Block definition"
, "blockDefinition :: Block -> BlockDefinition"
, "blockDefinition b = case b of"
, mconcat (reverse defs)
, "-- [TODO] @since"
, "-- | All the block ranges, in ascending order."
, "{-# INLINE allBlockRanges #-}"
, "allBlockRanges :: [(Int, Int)]"
, "allBlockRanges ="
, " " <> show ranges'
, ""
, "-- [TODO] @since"
, "-- | Character block, if defined."
, "block :: Char -> Maybe Block"
, "block c"
, mconcat (reverse ranges)
, " | otherwise = Nothing"
, " where cp = ord c"
, "block :: Char -> Maybe Int"
, "block (C# c#) = getBlock 0# " <> shows (length ranges - 1) "#"
, " where"
, " -- [NOTE] Encoding"
, " -- A range is encoded as two LE Word32:"
, " -- • First one is the lower bound, where the higher 11 bits are the block"
, " -- index and the lower 21 bits are the codepoint."
, " -- • Second one is the upper bound, which correspond to the codepoint."
, ""
, " cp# = int2Word# (ord# c#)"
, ""
, " -- Binary search"
, " getBlock l# u# = if isTrue# (l# ># u#)"
, " then Nothing"
, " else"
, " let k# = l# +# uncheckedIShiftRL# (u# -# l#) 1#"
, " j# = k# `uncheckedIShiftL#` 1#"
, " cpL0# = getRawCodePoint# j#"
, " cpL# = cpL0# `and#` 0x1fffff## -- Mask for codepoint: [0..0x10fff]"
, " cpU# = getRawCodePoint# (j# +# 1#)"
, " in if isTrue# (cpU# `ltWord#` cp#)"
, " -- cp > upper bound"
, " then getBlock (k# +# 1#) u#"
, " -- check lower bound"
, " else if isTrue# (cp# `ltWord#` cpL#)"
, " -- cp < lower bound"
, " then getBlock l# (k# -# 1#)"
, " -- cp in block: get block index"
, " else let block# = cpL0# `uncheckedShiftRL#` 21#"
, " in Just (I# (word2Int# block#))"
, ""
, " getRawCodePoint# k# ="
, "#ifdef WORDS_BIGENDIAN"
, "#if MIN_VERSION_base(4,16,0)"
, " byteSwap32# (word32ToWord# (indexWord32OffAddr# addr# k#))"
, "#else"
, " byteSwap32# (indexWord32OffAddr# ranges# k#)"
, "#endif"
, "#elif MIN_VERSION_base(4,16,0)"
, " word32ToWord# (indexWord32OffAddr# ranges# k#)"
, "#else"
, " indexWord32OffAddr# ranges# k#"
, "#endif"
, ""
, " -- Encoded ranges"
, " ranges# = \"" <> enumMapToAddrLiteral (mkRanges ranges') "\"#"
]

initial :: ([String], [String], [String], Int)
initial = (mempty, mempty, mempty, 0)
initial :: ([String], [String], [(Int, Int)])
initial = (mempty, mempty, mempty)

step (blocks, defs, ranges, expected) (blockName, blockRange) =
step (blocks, defs, ranges) (blockName, blockRange) =
let blockID = mkHaskellConstructor blockName
precedByNoBlock = expected < fst blockRange
in ( mkBlockConstructor blockID blockName blockRange : blocks
, mkBlockDef blockID blockName blockRange : defs
, mkBlockRange blockID blockRange precedByNoBlock : ranges
, succ (snd blockRange) )
, blockRange : ranges )

mkBlockConstructor blockID blockName (l, u) = mconcat
[ blockID
Expand All @@ -340,7 +390,7 @@ genBlocksModule moduleName = done <$> Fold.foldl' step initial
]

mkBlockDef blockID blockName (l, u) = mconcat
[ " "
[ " "
, blockID
, " -> BlockDefinition (0x"
, showPaddedHex l
Expand All @@ -351,22 +401,17 @@ genBlocksModule moduleName = done <$> Fold.foldl' step initial
, "\n"
]

mkBlockRange :: String -> (Int, Int) -> Bool -> String
mkBlockRange blockID blockRange precedByNoBlock =
let {
noBlock = mconcat
[ " | cp <= 0x"
, showHex (fst blockRange - 1) " = Nothing\n"
];
block = mconcat
[ " | cp <= 0x"
, showHex (snd blockRange) " = Just "
, blockID
, "\n"
]
} in if precedByNoBlock
then noBlock <> block
else block
-- [NOTE] Encoding: a range is encoded as two LE Word32:
-- • First one is the lower bound, where the higher 11 bits are the block
-- index and the lower 21 bits are the codepoint.
-- • Second one is upper bound, which correspond to the codepoint.
mkRanges :: [(Int, Int)] -> [Word8]
mkRanges = foldMap (uncurry mkBlockRange) . zip [0..]
mkBlockRange :: Word32 -> (Int, Int) -> [Word8]
mkBlockRange idx (l, u) = encodeBound idx l <> encodeBound 0 u

encodeBound :: Word32 -> Int -> [Word8]
encodeBound idx n = word32ToWord8s ((idx `shiftL` 21) .|. fromIntegral n)

genScriptsModule
:: Monad m
Expand Down Expand Up @@ -493,11 +538,8 @@ genScriptsModule moduleName =
Left c -> encodeBytes (fromIntegral (ord c))
Right (l, u) -> encodeBytes (setBit (fromIntegral (ord l)) 31)
<> encodeBytes (setBit (fromIntegral (ord u)) 31)
encodeBytes = foldr addByte "" . toWord8s
encodeBytes = foldr addByte "" . word32ToWord8s
addByte n acc = '\\' : shows n acc
-- Encode Word32 to [Word8] little endian
toWord8s :: Word32 -> [Word8]
toWord8s n = (\k -> fromIntegral ((n `shiftR` k) .&. 0xff)) <$> [0,8..24]

mkCharScripts :: [String] -> [ScriptLine] -> String
mkCharScripts scripts scriptsRanges =
Expand Down
12 changes: 9 additions & 3 deletions unicode-data/lib/Unicode/Char/General/Blocks.hs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
module Unicode.Char.General.Blocks
( B.Block(..)
, B.BlockDefinition(..)
, B.block
, block
, B.blockDefinition
, inBlock
, allBlockRanges
Expand All @@ -26,6 +26,12 @@ import Data.Char (chr, ord)
import Data.Ix (inRange)
import qualified Unicode.Internal.Char.Blocks as B

-- [TODO] @since
-- | Character block, if defined.
{-# INLINE block #-}
block :: Char -> Maybe B.Block
block = fmap toEnum . B.block

-- [TODO] @since
-- | Check if a character is in a block.
{-# INLINE inBlock #-}
Expand All @@ -36,11 +42,11 @@ inBlock b = inRange (B.blockRange (B.blockDefinition b)) . ord
-- | All the block ranges, in ascending order.
{-# INLINE allBlockRanges #-}
allBlockRanges :: [(Int, Int)]
allBlockRanges = B.blockRange . B.blockDefinition <$> [minBound..maxBound]
allBlockRanges = B.allBlockRanges

-- [TODO] @since
-- | Variant of 'allBlockRanges', with ranges expressed as 'Char's instead of
-- 'Int's.
{-# INLINE allBlockRanges' #-}
allBlockRanges' :: [(Char, Char)]
allBlockRanges' = (chr *** chr) <$> allBlockRanges
allBlockRanges' = (chr *** chr) <$> B.allBlockRanges
Loading

0 comments on commit 9f0a284

Please sign in to comment.