Skip to content

Commit

Permalink
Add support for Unicode scripts.
Browse files Browse the repository at this point in the history
  • Loading branch information
wismill committed Jun 15, 2022
1 parent 98d079f commit c07d42e
Show file tree
Hide file tree
Showing 6 changed files with 709 additions and 7 deletions.
1 change: 1 addition & 0 deletions ucd.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ FILES="\
ucd/UnicodeData.txt:36018e68657fdcb3485f636630ffe8c8532e01c977703d2803f5b89d6c5feafb \
ucd/PropList.txt:6bddfdb850417a5bee6deff19290fd1b138589909afb50f5a049f343bf2c6722 \
ucd/NameAliases.txt:14b3b677d33f95c51423dce6eef4a6a28b4b160451ecedee4b91edb6745cf4a3 \
ucd/Scripts.txt:52db475c4ec445e73b0b16915448c357614946ad7062843c563e00d7535c6510 \
ucd/extracted/DerivedCombiningClass.txt:12b0c3af9b600b49488d66545a3e7844ea980809627201bf9afeebe1c9f16f4e \
ucd/extracted/DerivedName.txt:fef3e11514ba152f0d38a09f8018c03a825f846dbb912334c1e5c9fb29392a02 \
ucd/extracted/DerivedNumericValues.txt:11075771b112e8e7ccf6ffa637c4c91eadc3ef3db0517b24e605df8fd3624239"
Expand Down
201 changes: 194 additions & 7 deletions unicode-data/exe/Parser/Text.hs
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,13 @@ import Control.Monad.IO.Class (MonadIO(liftIO))
import Data.Bifunctor (Bifunctor(..))
import Data.Bits (Bits(..))
import Data.Char (chr, ord, isAlphaNum, isAscii, isSpace, toUpper)
import Data.Function ((&))
import Data.Foldable (foldl')
import Data.Function (on, (&))
import Data.Functor ((<&>))
import Data.List (dropWhileEnd, elemIndex, intersperse, sort, unfoldr)
import Data.List (dropWhileEnd, elemIndex, groupBy, intersperse, sort, unfoldr)
import Data.Maybe (fromMaybe)
import Data.Ratio ((%))
import Data.Word (Word8)
import Data.Word (Word8, Word32)
import Numeric (showHex)
import Streamly.Data.Fold (Fold)
import Streamly.Prelude (IsStream, SerialT)
Expand All @@ -50,8 +51,6 @@ import qualified Streamly.FileSystem.Handle as Handle
import qualified System.IO as Sys
import qualified Streamly.Unicode.Stream as Unicode

import Prelude hiding (pred)

-------------------------------------------------------------------------------
-- Types
-------------------------------------------------------------------------------
Expand Down Expand Up @@ -93,6 +92,8 @@ data DetailedChar =
}
deriving (Show)

type CharRange = Either Char (Char, Char)

-------------------------------------------------------------------------------
-- Helpers
-------------------------------------------------------------------------------
Expand Down Expand Up @@ -367,6 +368,153 @@ genBlocksModule moduleName = done <$> Fold.foldl' step initial
then noBlock <> block
else block

genScriptsModule
:: Monad m
=> String
-> Fold m ScriptLine String
genScriptsModule moduleName =
done <$> Fold.foldl' addRange mempty
where

done ranges =
let scripts = Set.toList (foldr addScript (Set.singleton "Unknown") ranges)
in unlines
[ apacheLicense 2022 moduleName
, "{-# LANGUAGE MultiWayIf #-}"
, "{-# OPTIONS_HADDOCK hide #-}"
, ""
, "module " <> moduleName
, "(Script(..), script, scriptDefinition)"
, "where"
, ""
, "import Data.Char (ord)"
, "import Data.Int (Int32)"
, "import Data.Ix (Ix)"
, "import GHC.Exts (Ptr(..))"
, "import Unicode.Internal.Bits (lookupIntN)"
, ""
, "-- [TODO] @since"
, "-- | Unicode script."
, "data Script"
, " = " <> mkScripts scripts
, " deriving (Enum, Bounded, Eq, Ord, Ix, Show)"
, ""
, "-- [TODO] @since"
, "-- | Script definition: list of corresponding characters."
, "scriptDefinition :: Script -> (Ptr Int32, Int)"
, "scriptDefinition b = case b of"
, mkScriptDefinitions ranges
, "-- [TODO] @since"
, "-- | Script of a character."
, if length scripts <= 0xff
then mkCharScripts scripts ranges
else error "Cannot encode scripts"
, ""
]

addRange :: [ScriptLine] -> ScriptLine -> [ScriptLine]
addRange acc l@(script, r) = case acc of
(script', r'):acc' -> if script == script'
then case combineRanges r r' of
Left r'' -> (script, r'') : acc
Right r'' -> (script, r'') : acc'
else l : acc
_ -> [l]

combineRanges :: CharRange -> CharRange -> Either CharRange CharRange
combineRanges r = case r of
Left c1 -> \case
Left c2 -> if c1 == succ c2
then Right (Right (c2, c1))
else Left r
Right (c2, c3) -> if c1 == succ c3
then Right (Right (c2, c1))
else Left r
Right (c1, c2) -> \case
Left c3 -> if c1 == succ c3
then Right (Right (c3, c2))
else Left r
Right (c3, c4) -> if c1 == succ c4
then Right (Right (c3, c2))
else Left r

addScript :: ScriptLine -> Set.Set String -> Set.Set String
addScript (script, _) = Set.insert script

mkScripts scripts = mconcat (intersperse "\n | " scripts)

mkScriptDefinitions :: [ScriptLine] -> String
mkScriptDefinitions
= foldMap mkScriptDefinition
. groupBy ((==) `on` fst)
. reverse
. addUnknownRanges

addUnknownRanges :: [ScriptLine] -> [ScriptLine]
addUnknownRanges ls =
let addUnknown (acc, expected) (c, _) = case mkMissingRange expected c of
Just r -> (,succ c) $ case acc of
r':acc' -> either (:acc) (:acc') (combineRanges r r')
_ -> [r]
Nothing -> (acc, succ expected)
addRest (acc@(r':acc'), expected) =
let r = Right (expected, maxBound)
in either (:acc) (:acc') (combineRanges r r')
addRest _ = error "impossible"
unknown = fmap ("Unknown",) . addRest $ foldl'
addUnknown
(mempty, '\0')
(sort (foldMap (rangeToCharScripts id) ls))
in unknown <> ls

mkMissingRange :: Char -> Char -> Maybe CharRange
mkMissingRange expected c
| c == expected = Nothing
| c == succ expected = Just (Left expected)
| otherwise = Just (Right (expected, pred c))

mkScriptDefinition :: [ScriptLine] -> String
mkScriptDefinition ranges = mconcat
[ " "
, fst (head ranges)
, " -> (Ptr \""
, foldMap encodeRange ranges
, "\"#, "
, show (foldr (\r -> either (const (+1)) (const (+2)) (snd r)) 0 ranges :: Word)
, ")\n"
]

-- Encoding:
-- • A single char is encoded as an LE Int32.
-- • A range is encoded as two LE Int32 (first is lower bound, second is
-- upper bound), which correspond to the codepoints with the 32th bit set.
encodeRange :: ScriptLine -> String
encodeRange (_, r) = case r of
Left c -> encodeBytes (fromIntegral (ord c))
Right (l, u) -> encodeBytes (setBit (fromIntegral (ord l)) 31)
<> encodeBytes (setBit (fromIntegral (ord u)) 31)
encodeBytes = foldr addByte "" . toWord8s
addByte n acc = '\\' : shows n acc
-- Encode Word32 to [Word8] little endian
toWord8s :: Word32 -> [Word8]
toWord8s n = (\k -> fromIntegral ((n `shiftR` k) .&. 0xff)) <$> [0,8..24]

mkCharScripts :: [String] -> [ScriptLine] -> String
mkCharScripts scripts scriptsRanges =
let charScripts = sort (foldMap (rangeToCharScripts getScript) scriptsRanges)
charScripts' = fst (foldl' addMissing (mempty, '\0') charScripts)
addMissing (acc, expected) x@(c, script) = if expected < c
then addMissing (def:acc, succ expected) x
else (script:acc, succ c)
def = getScript "Unknown"
getScript s = fromMaybe (error "script not found") (elemIndex s scripts)
in genEnumBitmap "script" def (reverse charScripts')

rangeToCharScripts :: (String -> b) -> ScriptLine -> [(Char, b)]
rangeToCharScripts f (script, r) = case r of
Left cp -> [(cp, f script)]
Right (l, u) -> (, f script) <$> [l..u]

-------------------------------------------------------------------------------
-- Parsing UnicodeData.txt
-------------------------------------------------------------------------------
Expand Down Expand Up @@ -529,8 +677,8 @@ genDecomposeDefModule ::
-> DType
-> (Int -> Bool)
-> Fold m DetailedChar String
genDecomposeDefModule moduleName before after dtype pred =
Fold.filter (pred . ord . _char)
genDecomposeDefModule moduleName before after dtype predicate =
Fold.filter (predicate . ord . _char)
$ filterNonHangul
$ filterDecomposableType dtype $ done <$> Fold.foldl' step initial

Expand Down Expand Up @@ -966,6 +1114,34 @@ parseBlockLines = Stream.mapMaybe parseBlockLine
-- Parsing script file
-------------------------------------------------------------------------------

type ScriptLine = (String, Either Char (Char, Char))

parseScriptLine :: String -> Maybe ScriptLine
parseScriptLine ln
| null ln = Nothing
| head ln == '#' = Nothing
| otherwise = Just (parseLine ln)

where

parseLine line =
let (rangeLn, line1) = span (/= ';') line
script = takeWhile (/= '#') (tail line1)

in (trim script, parseRange (trim rangeLn))

parseRange :: String -> Either Char (Char, Char)
parseRange
= (\(c1, c2) -> maybe (Left c1) (Right . (c1,)) c2)
. bimap readCodePoint (readCodePointM . drop 2)
. span (/= '.')

parseScriptLines
:: (IsStream t, Monad m)
=> t m String
-> t m ScriptLine
parseScriptLines = Stream.mapMaybe parseScriptLine

-------------------------------------------------------------------------------
-- Parsing property files
-------------------------------------------------------------------------------
Expand Down Expand Up @@ -1370,6 +1546,13 @@ genCoreModules indir outdir props = do
outdir
[ blocks ]

runGenerator
indir
"Scripts.txt"
parseScriptLines
outdir
[ scripts ]

runGenerator
indir
"UnicodeData.txt"
Expand Down Expand Up @@ -1415,6 +1598,10 @@ genCoreModules indir outdir props = do
( "Unicode.Internal.Char.Blocks"
, genBlocksModule)

scripts =
( "Unicode.Internal.Char.Scripts"
, genScriptsModule)

propList =
("Unicode.Internal.Char.PropList"
, (`genCorePropertiesModule` (`elem` props)))
Expand Down
123 changes: 123 additions & 0 deletions unicode-data/lib/Unicode/Char/General/Scripts.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
{-# LANGUAGE CPP #-}

-- [TODO] @since
-- |
-- Module : Unicode.Char.General
-- Copyright : (c) 2020 Composewell Technologies and Contributors
-- License : Apache-2.0
-- Maintainer : [email protected]
-- Stability : experimental
--
-- Unicode scripts related functions.
--

module Unicode.Char.General.Scripts
( S.Script(..)
, script
, scriptDefinition
, inScript
)
where

import Data.Char (chr)
import GHC.Exts
(Ptr(..), Char(..), Int(..),
indexWord32OffAddr#, word2Int#, int2Word#,
and#, isTrue#, eqWord#, leWord#, neWord#,
andI#, (-#), (<#),
chr#, ord#)
#if MIN_VERSION_base(4,16,0)
import GHC.Exts (word32ToWord#)
#endif
#ifdef WORDS_BIGENDIAN
import GHC.Exts (byteSwap32#)
#endif

import qualified Unicode.Internal.Char.Scripts as S

-- [TODO] @since
-- | Character script
{-# INLINE script #-}
script :: Char -> S.Script
script = toEnum . S.script

{- HLINT ignore scriptDefinition "Eta reduce" -}
-- [TODO] @since
-- | Characters correspinding to a 'S.Script'.
scriptDefinition :: S.Script -> String
scriptDefinition = unpack . S.scriptDefinition
where
-- [NOTE] Encoding:
-- • A single char is encoded as an LE Word32.
-- • A range is encoded as two LE Word32 (first is lower bound, second is
-- upper bound), which correspond to the codepoints with the 32th bit set.

scriptRangeMask# = 0x80000000## -- 1 << 31
maskComplement# = 0x7fffffff## -- 1 << 31 ^ 0xffffffff

unpack (Ptr addr#, I# n#) = let {
getRawCodePoint k# =
#ifdef WORDS_BIGENDIAN
#if MIN_VERSION_base(4,16,0)
byteSwap32# (word32ToWord# (indexWord32OffAddr# addr# k#));
#else
byteSwap32# (indexWord32OffAddr# addr# k#);
#endif
#elif MIN_VERSION_base(4,16,0)
word32ToWord# (indexWord32OffAddr# addr# k#);
#else
indexWord32OffAddr# addr# k#;
#endif
getCodePoint k# = word2Int# (and# maskComplement# k#);
addRange k# acc = if isTrue# (k# <# 0#)
then acc
else let {
r1# = getRawCodePoint k#;
c1# = getCodePoint r1#;
isRange = isTrue# (and# r1# scriptRangeMask# `neWord#` 0##)
} in if isRange
then let {
c2# = getCodePoint (getRawCodePoint (k# -# 1#));
acc' = foldr ((:) . chr) acc [I# c2# .. I# c1#]
} in addRange (k# -# 2#) acc'
else addRange (k# -# 1#) (C# (chr# c1#) : acc)
} in addRange (n# -# 1#) mempty

{- HLINT ignore inScript "Eta reduce" -}
-- [TODO] @since
-- | Check if a character is in a 'S.Script'.
inScript :: S.Script -> Char -> Bool
inScript s (C# c#) = check (S.scriptDefinition s)
where
-- [NOTE] see 'scriptDefinition' for the description of the encoding.

scriptRangeMask# = 0x80000000## -- 1 << 31
maskComplement# = 0x7fffffff## -- 1 << 31 ^ 0xffffffff
cp# = int2Word# (ord# c#)

check (Ptr addr#, I# n#) = let {
getRawCodePoint k# =
#ifdef WORDS_BIGENDIAN
#if MIN_VERSION_base(4,16,0)
byteSwap32# (word32ToWord# (indexWord32OffAddr# addr# k#));
#else
byteSwap32# (indexWord32OffAddr# addr# k#);
#endif
#elif MIN_VERSION_base(4,16,0)
word32ToWord# (indexWord32OffAddr# addr# k#);
#else
indexWord32OffAddr# addr# k#;
#endif
getCodePoint k# = and# maskComplement# k#;
find k# = not (isTrue# (k# <# 0#)) &&
let {
r1# = getRawCodePoint k#;
c1# = getCodePoint r1#;
isRange = isTrue# (and# r1# scriptRangeMask# `neWord#` 0##)
} in if isRange
then let {
c2# = getCodePoint (getRawCodePoint (k# -# 1#));
found = isTrue# ((c2# `leWord#` cp#) `andI#` (cp# `leWord#` c1#))
} in found || find (k# -# 2#)
else isTrue# (c1# `eqWord#` cp#) || find (k# -# 1#)
} in find (n# -# 1#)
Loading

0 comments on commit c07d42e

Please sign in to comment.