Skip to content

Commit

Permalink
Remove inScript and add scriptDefinition benchmark.
Browse files Browse the repository at this point in the history
  • Loading branch information
wismill committed Sep 20, 2022
1 parent 7f709eb commit 3987ba8
Show file tree
Hide file tree
Showing 3 changed files with 82 additions and 22 deletions.
31 changes: 24 additions & 7 deletions unicode-data-scripts/bench/Main.hs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import Control.DeepSeq (NFData, deepseq, force)
import Control.Exception (evaluate)
import Data.Ix (Ix(..))
import Test.Tasty.Bench
(Benchmark, bgroup, bench, defaultMain, env, nf)

Expand All @@ -10,22 +11,38 @@ main :: IO ()
main = defaultMain
[ bgroup "Unicode.Char.General.Script"
[ bgroup "script"
[ benchNF "unicode-data" (show . S.script)
[ benchChars "unicode-data" (show . S.script)
]
, bgroup "scriptDefinition"
[ benchNF "unicode-data" (show . S.scriptDefinition)
]
-- [TODO] scriptDefinition, inScript
]
]
where
benchNF :: forall a. (NFData a) => String -> (Char -> a) -> Benchmark
benchNF t f =
benchChars :: forall a. (NFData a) => String -> (Char -> a) -> Benchmark
benchChars t f =
-- Avoid side-effects with garbage collection (see tasty-bench doc)
env
(evaluate (force chars)) -- initialize
(bench t . nf (fold_ f)) -- benchmark
(bench t . nf (foldString f)) -- benchmark
where
-- Filter out: Surrogates, Private Use Areas and unsassigned code points
chars = filter isValid [minBound..maxBound]
isValid c = G.generalCategory c < G.Surrogate

fold_ :: forall a. (NFData a) => (Char -> a) -> String -> ()
fold_ f = foldr (deepseq . f) ()
foldString :: forall a. (NFData a) => (Char -> a) -> String -> ()
foldString f = foldr (deepseq . f) ()

benchNF
:: forall a b. (Bounded a, Ix a, NFData b)
=> String
-> (a -> b)
-> Benchmark
benchNF t f = bench t (nf (fold_ f) (minBound, maxBound))

fold_
:: forall a b. (Ix a, NFData b)
=> (a -> b)
-> (a, a)
-> ()
fold_ f = foldr (deepseq . f) () . range
7 changes: 0 additions & 7 deletions unicode-data-scripts/lib/Unicode/Char/General/Scripts.hs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ module Unicode.Char.General.Scripts
( S.Script(..)
, script
, scriptDefinition
, inScript
)
where

Expand All @@ -39,12 +38,6 @@ import qualified Unicode.Internal.Char.Scripts as S
script :: Char -> S.Script
script = toEnum . S.script

-- [TODO] @since
-- | Check if a character is in a 'S.Script'.
{-# INLINE inScript #-}
inScript :: S.Script -> Char -> Bool
inScript s = (== s) . script

{- HLINT ignore scriptDefinition "Eta reduce" -}
-- [TODO] @since
-- | Characters correspinding to a 'S.Script'.
Expand Down
66 changes: 58 additions & 8 deletions unicode-data-scripts/test/Unicode/Char/General/ScriptsSpec.hs
Original file line number Diff line number Diff line change
@@ -1,12 +1,26 @@
{-# LANGUAGE BlockArguments #-}
{-# LANGUAGE BlockArguments, CPP #-}

module Unicode.Char.General.ScriptsSpec
( spec
) where

import qualified Unicode.Char.General.Scripts as UScripts
import Data.Foldable (traverse_)
import Test.Hspec
import qualified Unicode.Char.General.Scripts as UScripts
import qualified Unicode.Internal.Char.Scripts as S

import GHC.Exts
(Ptr(..), Char(..), Int(..),
indexWord32OffAddr#, int2Word#,
and#, isTrue#, eqWord#, leWord#, neWord#,
andI#, (-#), (<#),
ord#)
#if MIN_VERSION_base(4,16,0)
import GHC.Exts (word32ToWord#)
#endif
#ifdef WORDS_BIGENDIAN
import GHC.Exts (byteSwap32#)
#endif

{- [NOTE]
These tests may fail if the compiler’s Unicode version
Expand All @@ -26,14 +40,11 @@ does not match the version of this package.
spec :: Spec
spec = do
describe "Unicode scripts" do
it "inScript"
let check s = if all (UScripts.inScript s) (UScripts.scriptDefinition s)
then pure ()
else expectationFailure (show s)
in traverse_ check [minBound..maxBound]
it "Characters are in the definition of their corresponding script"
let {
check c = let s = UScripts.script c in if UScripts.inScript s c
check c =
let s = UScripts.script c
in if s `inScript` c
then pure ()
else expectationFailure $ mconcat
[ "Char “", show c, "” in not in the definition of “"
Expand All @@ -49,3 +60,42 @@ spec = do
check s = let chars = UScripts.scriptDefinition s
in traverse_ (checkChar s) chars
} in traverse_ check [minBound..maxBound]

{- HLINT ignore inScript "Eta reduce" -}
-- Check if a character is in a 'S.Script'.
-- This is faster than testing the string from UScripts.scriptDefinition
inScript :: S.Script -> Char -> Bool
inScript s (C# c#) = check (S.scriptDefinition s)
where
-- [NOTE] see 'scriptDefinition' for the description of the encoding.

scriptRangeMask# = 0x80000000## -- 1 << 31
maskComplement# = 0x7fffffff## -- 1 << 31 ^ 0xffffffff
cp# = int2Word# (ord# c#)

check (Ptr addr#, I# n#) = let {
getRawCodePoint k# =
#ifdef WORDS_BIGENDIAN
#if MIN_VERSION_base(4,16,0)
byteSwap32# (word32ToWord# (indexWord32OffAddr# addr# k#));
#else
byteSwap32# (indexWord32OffAddr# addr# k#);
#endif
#elif MIN_VERSION_base(4,16,0)
word32ToWord# (indexWord32OffAddr# addr# k#);
#else
indexWord32OffAddr# addr# k#;
#endif
getCodePoint k# = and# maskComplement# k#;
find k# = not (isTrue# (k# <# 0#)) &&
let {
r1# = getRawCodePoint k#;
c1# = getCodePoint r1#;
isRange = isTrue# (and# r1# scriptRangeMask# `neWord#` 0##)
} in if isRange
then let {
c2# = getCodePoint (getRawCodePoint (k# -# 1#));
found = isTrue# ((c2# `leWord#` cp#) `andI#` (cp# `leWord#` c1#))
} in found || find (k# -# 2#)
else isTrue# (c1# `eqWord#` cp#) || find (k# -# 1#)
} in find (n# -# 1#)

0 comments on commit 3987ba8

Please sign in to comment.