diff --git a/unicode-data/Changelog.md b/unicode-data/Changelog.md index b182e0c..dba90fb 100644 --- a/unicode-data/Changelog.md +++ b/unicode-data/Changelog.md @@ -15,6 +15,11 @@ - Removed deprecated `Unicode.Char.Numeric.isNumber`. Use `Unicode.Char.Numeric.Compat.isNumber` instead. +### Deprecations + +- `Unicode.Char.General.isAlphaNum`. + Use `Unicode.Char.General.Compat.isAlphaNum` instead. + ## 0.5.0 (July 2024) - Fix the inlining of `Addr#` literals and reduce their size. This results in diff --git a/unicode-data/bench/Unicode/Char/General/CompatBench.hs b/unicode-data/bench/Unicode/Char/General/CompatBench.hs index f0a5db0..eaf0646 100644 --- a/unicode-data/bench/Unicode/Char/General/CompatBench.hs +++ b/unicode-data/bench/Unicode/Char/General/CompatBench.hs @@ -20,6 +20,10 @@ benchmarks r = bgroupWithCharRange "Unicode.Char.General.Compat" r $ \chars -> [ Bench "base" Char.isAlpha , Bench "unicode-data" GC.isAlpha ] + , bgroupWithChars "isAlphaNum" chars + [ Bench "base" Char.isAlphaNum + , Bench "unicode-data" GC.isAlphaNum + ] , bgroupWithChars "isLetter" chars [ Bench "base" Char.isLetter , Bench "unicode-data" GC.isLetter diff --git a/unicode-data/bench/Unicode/Char/GeneralBench.hs b/unicode-data/bench/Unicode/Char/GeneralBench.hs index 4703066..42b314c 100644 --- a/unicode-data/bench/Unicode/Char/GeneralBench.hs +++ b/unicode-data/bench/Unicode/Char/GeneralBench.hs @@ -25,10 +25,6 @@ benchmarks r = bgroupWithCharRange "Unicode.Char.General" r $ \chars -> , bgroupWithChars "isAlphabetic" chars [ Bench "unicode-data" G.isAlphabetic ] - , bgroupWithChars "isAlphaNum" chars - [ Bench "base" Char.isAlphaNum - , Bench "unicode-data" G.isAlphaNum - ] , bgroupWithChars "isControl" chars [ Bench "base" Char.isControl , Bench "unicode-data" G.isControl diff --git a/unicode-data/lib/Unicode/Char.hs b/unicode-data/lib/Unicode/Char.hs index 1227341..aee2100 100644 --- a/unicode-data/lib/Unicode/Char.hs +++ b/unicode-data/lib/Unicode/Char.hs @@ -47,7 +47,7 @@ 
where import Data.Char (chr, ord) import Unicode.Char.Case hiding (Unfold(..), Step(..)) import Unicode.Char.Case.Compat -import Unicode.Char.General +import Unicode.Char.General hiding (isAlphaNum) import Unicode.Char.General.Compat import Unicode.Char.Identifiers import Unicode.Char.Numeric diff --git a/unicode-data/lib/Unicode/Char/General.hs b/unicode-data/lib/Unicode/Char/General.hs index a222630..9f82f8c 100644 --- a/unicode-data/lib/Unicode/Char/General.hs +++ b/unicode-data/lib/Unicode/Char/General.hs @@ -100,11 +100,12 @@ import Control.Exception (assert) import Data.Bits ((.&.)) import Data.Char (isAscii, isLatin1, isAsciiUpper, isAsciiLower, ord) import Data.Ix (Ix) -import Unicode.Internal.Division (quotRem28) +import qualified Unicode.Char.General.Compat as Compat import qualified Unicode.Internal.Char.DerivedCoreProperties as P import qualified Unicode.Internal.Char.PropList as P import qualified Unicode.Internal.Char.UnicodeData.GeneralCategory as UC +import Unicode.Internal.Division (quotRem28) -------------------------------------------------------------------------------- -- General Category @@ -381,20 +382,19 @@ following 'GeneralCategory's, or 'False' otherwise: prop> isAlphaNum c == Data.Char.isAlphaNum c +__Note:__ this function is incompatible with 'isAlphabetic': + +>>> isAlphabetic '\x345' +True +>>> isAlphaNum '\x345' +False + @since 0.3.0 -} +{-# INLINE isAlphaNum #-} +{-# DEPRECATED isAlphaNum "Use Unicode.Char.General.Compat.isAlphaNum instead." #-} isAlphaNum :: Char -> Bool -isAlphaNum c = - let !cp = ord c - -- NOTE: The guard constant is updated at each Unicode revision. - -- It must be < 0x40000 to be accepted by generalCategoryPlanes0To3. 
- in cp <= UC.MaxIsAlphaNum && - let !gc = UC.generalCategoryPlanes0To3 cp - in gc <= UC.OtherLetter || - (UC.DecimalNumber <= gc && gc <= UC.OtherNumber) - -- Use the following in case the previous code is not valid anymore: - -- gc <= UC.OtherLetter || (UC.DecimalNumber <= gc && gc <= UC.OtherNumber) - -- where !gc = UC.generalCategory c +isAlphaNum = Compat.isAlphaNum {-| Selects control characters, which are the non-printing characters of the Latin-1 subset of Unicode. diff --git a/unicode-data/lib/Unicode/Char/General/Compat.hs b/unicode-data/lib/Unicode/Char/General/Compat.hs index adb5b5f..3ca4204 100644 --- a/unicode-data/lib/Unicode/Char/General/Compat.hs +++ b/unicode-data/lib/Unicode/Char/General/Compat.hs @@ -13,6 +13,7 @@ -- module Unicode.Char.General.Compat ( isAlpha + , isAlphaNum , isLetter , isSpace ) where @@ -20,6 +21,9 @@ module Unicode.Char.General.Compat import Data.Char (ord) import qualified Unicode.Internal.Char.UnicodeData.GeneralCategory as UC +-- $setup +-- >>> import qualified Unicode.Char.General + -- | Same as 'isLetter'. -- -- @since 0.3.0 @@ -27,6 +31,46 @@ import qualified Unicode.Internal.Char.UnicodeData.GeneralCategory as UC isAlpha :: Char -> Bool isAlpha = isLetter +{-| Selects alphabetic or numeric Unicode characters. + +This function returns 'True' if its argument has one of the +following 'GeneralCategory's, or 'False' otherwise: + +* 'UppercaseLetter' +* 'LowercaseLetter' +* 'TitlecaseLetter' +* 'ModifierLetter' +* 'OtherLetter' +* 'DecimalNumber' +* 'LetterNumber' +* 'OtherNumber' + +prop> isAlphaNum c == Data.Char.isAlphaNum c + +__Note:__ this function is incompatible with 'Unicode.Char.General.isAlphabetic': + +>>> Unicode.Char.General.isAlphabetic '\x345' +True +>>> isAlphaNum '\x345' +False + +@since 0.6.0 moved to Compat module + +@since 0.3.0 +-} +isAlphaNum :: Char -> Bool +isAlphaNum c = + let !cp = ord c + -- NOTE: The guard constant is updated at each Unicode revision. 
+ -- It must be < 0x40000 to be accepted by generalCategoryPlanes0To3. + in cp <= UC.MaxIsAlphaNum && + let !gc = UC.generalCategoryPlanes0To3 cp + in gc <= UC.OtherLetter || + (UC.DecimalNumber <= gc && gc <= UC.OtherNumber) + -- Use the following in case the previous code is not valid anymore: + -- gc <= UC.OtherLetter || (UC.DecimalNumber <= gc && gc <= UC.OtherNumber) + -- where !gc = UC.generalCategory c + {-| Selects alphabetic Unicode characters (lower-case, upper-case and title-case letters, plus letters of caseless scripts and modifiers letters).