From 740e494525559b2c46217fe1ed28d1feb7998f70 Mon Sep 17 00:00:00 2001 From: Andreas Abel Date: Mon, 18 Jul 2022 10:15:27 +0200 Subject: [PATCH] Remove `Maybe` indirection in `PatternSet` (+ LambdaCase) Instead of `Maybe . Set` just have `Set`, collapsing the two null values `Nothing` and `Just mempty`. The distinction of these wasn't used anywhere. We drop GHC 7.4 in favor of using LambdaCase. --- .github/workflows/haskell-ci.yml | 5 -- lib/Text/Regex/TDFA/Pattern.hs | 94 +++++++++++++++++++++++--------- lib/Text/Regex/TDFA/ReadRegex.hs | 44 +++++++++------ regex-tdfa.cabal | 5 +- 4 files changed, 98 insertions(+), 50 deletions(-) diff --git a/.github/workflows/haskell-ci.yml b/.github/workflows/haskell-ci.yml index 8a9f826..45e46a6 100644 --- a/.github/workflows/haskell-ci.yml +++ b/.github/workflows/haskell-ci.yml @@ -92,11 +92,6 @@ jobs: compilerVersion: 7.6.3 setup-method: hvr-ppa allow-failure: false - - compiler: ghc-7.4.2 - compilerKind: ghc - compilerVersion: 7.4.2 - setup-method: hvr-ppa - allow-failure: false fail-fast: false steps: - name: apt diff --git a/lib/Text/Regex/TDFA/Pattern.hs b/lib/Text/Regex/TDFA/Pattern.hs index 066f1c7..5735da0 100644 --- a/lib/Text/Regex/TDFA/Pattern.hs +++ b/lib/Text/Regex/TDFA/Pattern.hs @@ -5,19 +5,23 @@ -- the parsed form of a regular expression. module Text.Regex.TDFA.Pattern - (Pattern(..) - ,PatternSet(..) - ,PatternSetCharacterClass(..) - ,PatternSetCollatingElement(..) - ,PatternSetEquivalenceClass(..) - ,GroupIndex - ,DoPa(..) - ,decodeCharacterClass, decodePatternSet - ,showPattern --- ** Internal use - ,starTrans --- ** Internal use, operations to support debugging under @ghci@ - ,starTrans',simplify',dfsPattern + ( Pattern(..) + , PatternSet(..) + , patternSetChars + , patternSetCharacterClasses + , patternSetCollatingElements + , patternSetEquivalenceClasses + , PatternSetCharacterClass(..) + , PatternSetCollatingElement(..) + , PatternSetEquivalenceClass(..) + , GroupIndex + , DoPa(..) + , decodeCharacterClass, decodePatternSet + , showPattern + -- ** Internal use + , starTrans + -- ** Internal use, operations to support debugging under @ghci@ + , starTrans', simplify', dfsPattern ) where {- By Chris Kuklewicz, 2007. BSD License, see the LICENSE file. -} @@ -25,6 +29,8 @@ module Text.Regex.TDFA.Pattern import Data.List(intersperse,partition) import qualified Data.Set as Set import Data.Set (Set) + +import Utils import Text.Regex.TDFA.Common(DoPa(..),GroupIndex,common_error) err :: String -> a @@ -128,20 +134,54 @@ showPattern pIn = -- collating elements (e.g. @[.ch.]@, unused), and -- equivalence classes (e.g. @[=a=]@, treated as characters). -- -data PatternSet = PatternSet (Maybe (Set Char)) - (Maybe (Set PatternSetCharacterClass)) - (Maybe (Set PatternSetCollatingElement)) - (Maybe (Set PatternSetEquivalenceClass)) - deriving (Eq) +data PatternSet = PatternSet + { _patternSetChars :: Set Char + -- ^ Characters included in the pattern. + , _patternSetCharacterClasses :: Set PatternSetCharacterClass + -- ^ POSIX character classes included in the pattern. + , _patternSetCollatingElements :: Set PatternSetCollatingElement + -- ^ Collating elements included in the pattern. + , _patternSetEquivalenceClasses :: Set PatternSetEquivalenceClass + -- ^ Equivalence classes included in the pattern. + } + deriving (Eq) + +instance Semigroup PatternSet where + PatternSet a b c d <> PatternSet a' b' c' d' = + PatternSet (a <> a') (b <> b') (c <> c') (d <> d') + +instance Monoid PatternSet where + mempty = PatternSet mempty mempty mempty mempty + mappend = (<>) + +-- | Lens for '_patternSetChars'. +patternSetChars :: Lens' PatternSet (Set Char) +patternSetChars f ps = + f (_patternSetChars ps) <&> \ i -> ps{ _patternSetChars = i } + +-- | Lens for '_patternSetCharacterClasses'. +patternSetCharacterClasses :: Lens' PatternSet (Set PatternSetCharacterClass) +patternSetCharacterClasses f ps = + f (_patternSetCharacterClasses ps) <&> \ i -> ps{ _patternSetCharacterClasses = i } + +-- | Lens for '_patternSetCollatingElements'. +patternSetCollatingElements :: Lens' PatternSet (Set PatternSetCollatingElement) +patternSetCollatingElements f ps = + f (_patternSetCollatingElements ps) <&> \ i -> ps{ _patternSetCollatingElements = i } + +-- | Lens for '_patternSetEquivalenceClasses'. +patternSetEquivalenceClasses :: Lens' PatternSet (Set PatternSetEquivalenceClass) +patternSetEquivalenceClasses f ps = + f (_patternSetEquivalenceClasses ps) <&> \ i -> ps{ _patternSetEquivalenceClasses = i } -- | Hand-rolled implementation, giving textual rather than Haskell representation. instance Show PatternSet where showsPrec i (PatternSet s scc sce sec) = - let (special,normal) = maybe ("","") ((partition (`elem` "]-")) . Set.toAscList) s + let (special,normal) = partition (`elem` "]-") $ Set.toAscList s charSpec = (if ']' `elem` special then (']':) else id) (byRange normal) - scc' = maybe "" ((concatMap show) . Set.toList) scc - sce' = maybe "" ((concatMap show) . Set.toList) sce - sec' = maybe "" ((concatMap show) . Set.toList) sec + scc' = concatMap show $ Set.toList scc + sce' = concatMap show $ Set.toList sce + sec' = concatMap show $ Set.toList sec in shows charSpec . showsPrec i scc' . showsPrec i sce' . showsPrec i sec' . if '-' `elem` special then showChar '-' else id @@ -183,11 +223,11 @@ instance Show PatternSetEquivalenceClass where -- -- @since 1.3.2 decodePatternSet :: PatternSet -> Set Char -decodePatternSet (PatternSet msc mscc _ msec) = - let baseMSC = maybe Set.empty id msc - withMSCC = foldl (flip Set.insert) baseMSC (maybe [] (concatMap decodeCharacterClass . Set.toAscList) mscc) - withMSEC = foldl (flip Set.insert) withMSCC (maybe [] (concatMap unSEC . Set.toAscList) msec) - in withMSEC +decodePatternSet (PatternSet chars ccs _ eqcs) = Set.unions + [ chars + , foldMap (Set.fromList . decodeCharacterClass) ccs + , foldMap (Set.fromList . unSEC) eqcs + ] -- | This returns the strictly ascending list of characters -- represented by @[: :]@ POSIX character classes. diff --git a/lib/Text/Regex/TDFA/ReadRegex.hs b/lib/Text/Regex/TDFA/ReadRegex.hs index c1011fa..f199aed 100644 --- a/lib/Text/Regex/TDFA/ReadRegex.hs +++ b/lib/Text/Regex/TDFA/ReadRegex.hs @@ -14,11 +14,12 @@ import Text.ParserCombinators.Parsec((<|>), (), try, runParser, many, getState, setState, CharParser, ParseError, sepBy1, option, notFollowedBy, many1, lookAhead, eof, between, string, noneOf, digit, char, anyChar) +import Utils import Control.Monad (liftM, guard) -import Data.Foldable (asum) -import qualified Data.Set as Set(fromList) +import Data.Foldable (asum, foldl') +import qualified Data.Set as Set -- | An element inside @[...]@, denoting a character class. data BracketElement @@ -132,20 +133,31 @@ p_bracket :: P Pattern p_bracket = (char '[') >> ( (char '^' >> p_set True) <|> (p_set False) ) p_set :: Bool -> P Pattern -p_set invert = do initial <- option "" (char ']' >> return "]") - values <- if null initial then many1 p_set_elem else many p_set_elem - _ <- char ']' - ci <- char_index - let chars = maybe'set $ concat $ - initial : - [ c | BEChar c <- values ] : - [ [start..end] | BERange start end <- values ] - colls = maybe'set [PatternSetCollatingElement coll | BEColl coll <- values ] - equivs = maybe'set [PatternSetEquivalenceClass equiv | BEEquiv equiv <- values] - class's = maybe'set [PatternSetCharacterClass a'class | BEClass a'class <- values] - maybe'set x = if null x then Nothing else Just (Set.fromList x) - sets = PatternSet chars class's colls equivs - sets `seq` return $ if invert then PAnyNot ci sets else PAny ci sets +p_set invert = do + -- A ] as first character after the opening [ is treated as alternative ']' + -- rather than the closing bracket. + initial <- option mempty $ Set.singleton <$> char ']' + -- Parse remaining content of bracket expression. + values <- if Set.null initial then many1 p_set_elem else many p_set_elem + _ <- char ']' + ci <- char_index + -- Process the content of bracket expression into a PatternSet. + let !sets = foldl' (flip addBracketElement) (mempty{ _patternSetChars = initial }) values + return $ if invert then PAnyNot ci sets else PAny ci sets + +addBracketElement :: BracketElement -> PatternSet -> PatternSet +addBracketElement = \case + BEChar c -> + over patternSetChars $ Set.insert c + BERange start end -> + over patternSetChars $ (`Set.union` Set.fromDistinctAscList [start..end]) + -- Set.union is left-biased, [start..end] is considered the smaller set + BEClass s -> + over patternSetCharacterClasses $ Set.insert $ PatternSetCharacterClass s + BEColl s -> + over patternSetCollatingElements $ Set.insert $ PatternSetCollatingElement s + BEEquiv s -> + over patternSetEquivalenceClasses $ Set.insert $ PatternSetEquivalenceClass s -- From here down the code is the parser and functions for pattern [ ] set things diff --git a/regex-tdfa.cabal b/regex-tdfa.cabal index 3132b8f..092e919 100644 --- a/regex-tdfa.cabal +++ b/regex-tdfa.cabal @@ -37,7 +37,6 @@ tested-with: GHC == 7.10.3 GHC == 7.8.4 GHC == 7.6.3 - GHC == 7.4.2 source-repository head type: git @@ -99,7 +98,8 @@ library build-depends: fail == 4.9.* , semigroups == 0.18.* || == 0.19.* build-depends: array >= 0.4 && < 0.6 - , base >= 4.5 && < 5 + , base >= 4.6 && < 5 + -- GHC 7.6 required for LambdaCase , bytestring >= 0.9.2 && < 0.12 , containers >= 0.4.2 && < 0.7 , mtl >= 2.1.3 && < 2.4 @@ -114,6 +114,7 @@ library FlexibleInstances ForeignFunctionInterface FunctionalDependencies + LambdaCase MagicHash MultiParamTypeClasses NondecreasingIndentation