diff --git a/.github/workflows/haskell-ci.yml b/.github/workflows/haskell-ci.yml index 8a9f826..45e46a6 100644 --- a/.github/workflows/haskell-ci.yml +++ b/.github/workflows/haskell-ci.yml @@ -92,11 +92,6 @@ jobs: compilerVersion: 7.6.3 setup-method: hvr-ppa allow-failure: false - - compiler: ghc-7.4.2 - compilerKind: ghc - compilerVersion: 7.4.2 - setup-method: hvr-ppa - allow-failure: false fail-fast: false steps: - name: apt diff --git a/lib/Text/Regex/TDFA/Common.hs b/lib/Text/Regex/TDFA/Common.hs index a9884d4..6f39b8a 100644 --- a/lib/Text/Regex/TDFA/Common.hs +++ b/lib/Text/Regex/TDFA/Common.hs @@ -21,6 +21,7 @@ import Data.Sequence as S(Seq) --import Debug.Trace import Text.Regex.TDFA.IntArrTrieSet(TrieSet) +import Utils {-# INLINE look #-} look :: Int -> IntMap a -> a @@ -30,43 +31,6 @@ common_error :: String -> String -> a common_error moduleName message = error ("Explict error in module "++moduleName++" : "++message) -on :: (t1 -> t1 -> t2) -> (t -> t1) -> t -> t -> t2 -f `on` g = (\x y -> (g x) `f` (g y)) - --- | After 'sort' or 'sortBy' the use of 'nub' or 'nubBy' can be replaced by 'norep' or 'norepBy'. -norep :: (Eq a) => [a]->[a] -norep [] = [] -norep x@[_] = x -norep (a:bs@(c:cs)) | a==c = norep (a:cs) - | otherwise = a:norep bs - --- | After 'sort' or 'sortBy' the use of 'nub' or 'nubBy' can be replaced by 'norep' or 'norepBy'. -norepBy :: (a -> a -> Bool) -> [a] -> [a] -norepBy _ [] = [] -norepBy _ x@[_] = x -norepBy eqF (a:bs@(c:cs)) | a `eqF` c = norepBy eqF (a:cs) - | otherwise = a:norepBy eqF bs - -mapFst :: (Functor f) => (t -> t2) -> f (t, t1) -> f (t2, t1) -mapFst f = fmap (\ (a,b) -> (f a,b)) - -mapSnd :: (Functor f) => (t1 -> t2) -> f (t, t1) -> f (t, t2) -mapSnd f = fmap (\ (a,b) -> (a,f b)) - -fst3 :: (a,b,c) -> a -fst3 (x,_,_) = x - -snd3 :: (a,b,c) -> b -snd3 (_,x,_) = x - -thd3 :: (a,b,c) -> c -thd3 (_,_,x) = x - -flipOrder :: Ordering -> Ordering -flipOrder GT = LT -flipOrder LT = GT -flipOrder EQ = EQ - noWin :: WinTags -> Bool noWin = null diff --git a/lib/Text/Regex/TDFA/CorePattern.hs b/lib/Text/Regex/TDFA/CorePattern.hs index a5d299f..6e68c77 100644 --- a/lib/Text/Regex/TDFA/CorePattern.hs +++ b/lib/Text/Regex/TDFA/CorePattern.hs @@ -49,6 +49,7 @@ import Data.Semigroup as Sem import Text.Regex.TDFA.Common {- all -} import Text.Regex.TDFA.Pattern(Pattern(..),starTrans) +import Utils -- import Debug.Trace {- By Chris Kuklewicz, 2007. BSD License, see the LICENSE file. -} diff --git a/lib/Text/Regex/TDFA/Pattern.hs b/lib/Text/Regex/TDFA/Pattern.hs index b6fedf2..29422af 100644 --- a/lib/Text/Regex/TDFA/Pattern.hs +++ b/lib/Text/Regex/TDFA/Pattern.hs @@ -5,18 +5,22 @@ -- the parsed form of a Regular Expression. module Text.Regex.TDFA.Pattern - (Pattern(..) - ,PatternSet(..) - ,PatternSetCharacterClass(..) - ,PatternSetCollatingElement(..) - ,PatternSetEquivalenceClass(..) - ,GroupIndex - ,DoPa(..) - ,showPattern --- ** Internal use - ,starTrans --- ** Internal use, Operations to support debugging under ghci - ,starTrans',simplify',dfsPattern + ( Pattern(..) + , PatternSet(..) + , patternSetChars + , patternSetCharacterClasses + , patternSetCollatingElements + , patternSetEquivalenceClasses + , PatternSetCharacterClass(..) + , PatternSetCollatingElement(..) + , PatternSetEquivalenceClass(..) + , GroupIndex + , DoPa(..) + , showPattern + -- ** Internal use + , starTrans + -- ** Internal use, Operations to support debugging under ghci + , starTrans', simplify', dfsPattern ) where {- By Chris Kuklewicz, 2007. BSD License, see the LICENSE file. -} @@ -24,6 +28,8 @@ module Text.Regex.TDFA.Pattern import Data.List(intersperse,partition) import qualified Data.Set as Set(toAscList,toList) import Data.Set(Set) -- XXX EnumSet + +import Utils import Text.Regex.TDFA.Common(DoPa(..),GroupIndex,common_error) err :: String -> a @@ -92,19 +98,54 @@ showPattern pIn = -} paren s = ('(':s)++")" -data PatternSet = PatternSet (Maybe (Set Char)) - (Maybe (Set PatternSetCharacterClass)) - (Maybe (Set PatternSetCollatingElement)) - (Maybe (Set PatternSetEquivalenceClass)) - deriving (Eq) +-- | Processed content of a bracket expression. +data PatternSet = PatternSet + { _patternSetChars :: Set Char + -- ^ Characters included in the pattern. + , _patternSetCharacterClasses :: Set PatternSetCharacterClass + -- ^ POSIX character classes included in the pattern. + , _patternSetCollatingElements :: Set PatternSetCollatingElement + -- ^ Collating elements included in the pattern. + , _patternSetEquivalenceClasses :: Set PatternSetEquivalenceClass + -- ^ Equivalence classes included in the pattern. + } + deriving (Eq) + +instance Semigroup PatternSet where + PatternSet a b c d <> PatternSet a' b' c' d' = + PatternSet (a <> a') (b <> b') (c <> c') (d <> d') + +instance Monoid PatternSet where + mempty = PatternSet mempty mempty mempty mempty + mappend = (<>) + +-- | Lens for '_patternSetChars'. +patternSetChars :: Lens' PatternSet (Set Char) +patternSetChars f ps = + f (_patternSetChars ps) <&> \ i -> ps{ _patternSetChars = i } + +-- | Lens for '_patternSetCharacterClasses'. +patternSetCharacterClasses :: Lens' PatternSet (Set PatternSetCharacterClass) +patternSetCharacterClasses f ps = + f (_patternSetCharacterClasses ps) <&> \ i -> ps{ _patternSetCharacterClasses = i } + +-- | Lens for '_patternSetCollatingElements'. +patternSetCollatingElements :: Lens' PatternSet (Set PatternSetCollatingElement) +patternSetCollatingElements f ps = + f (_patternSetCollatingElements ps) <&> \ i -> ps{ _patternSetCollatingElements = i } + +-- | Lens for '_patternSetEquivalenceClasses'. +patternSetEquivalenceClasses :: Lens' PatternSet (Set PatternSetEquivalenceClass) +patternSetEquivalenceClasses f ps = + f (_patternSetEquivalenceClasses ps) <&> \ i -> ps{ _patternSetEquivalenceClasses = i } instance Show PatternSet where showsPrec i (PatternSet s scc sce sec) = - let (special,normal) = maybe ("","") ((partition (`elem` "]-")) . Set.toAscList) s + let (special,normal) = partition (`elem` "]-") $ Set.toAscList s charSpec = (if ']' `elem` special then (']':) else id) (byRange normal) - scc' = maybe "" ((concatMap show) . Set.toList) scc - sce' = maybe "" ((concatMap show) . Set.toList) sce - sec' = maybe "" ((concatMap show) . Set.toList) sec + scc' = concatMap show $ Set.toList scc + sce' = concatMap show $ Set.toList sce + sec' = concatMap show $ Set.toList sec in shows charSpec . showsPrec i scc' . showsPrec i sce' . showsPrec i sec' . if '-' `elem` special then showChar '-' else id diff --git a/lib/Text/Regex/TDFA/ReadRegex.hs b/lib/Text/Regex/TDFA/ReadRegex.hs index f29c46f..087fee6 100644 --- a/lib/Text/Regex/TDFA/ReadRegex.hs +++ b/lib/Text/Regex/TDFA/ReadRegex.hs @@ -14,11 +14,12 @@ import Text.ParserCombinators.Parsec((<|>), (), try, runParser, many, getState, setState, CharParser, ParseError, sepBy1, option, notFollowedBy, many1, lookAhead, eof, between, string, noneOf, digit, char, anyChar) +import Utils import Control.Monad (liftM, guard) -import Data.Foldable (asum) -import qualified Data.Set as Set(fromList) +import Data.Foldable (asum, foldl') +import qualified Data.Set as Set -- | An element inside @[...]@, denoting a character class. data BracketElement @@ -41,12 +42,12 @@ parseRegex x = runParser (do pat <- p_regex type P = CharParser (GroupIndex, Int) p_regex :: P Pattern -p_regex = liftM POr $ sepBy1 p_branch (char '|') +p_regex = POr <$> sepBy1 p_branch (char '|') -- man re_format helps a lot, it says one-or-more pieces so this is -- many1 not many. Use "()" to indicate an empty piece. p_branch :: P Pattern -p_branch = liftM PConcat $ many1 p_piece +p_branch = PConcat <$> many1 p_piece p_piece :: P Pattern p_piece = (p_anchor <|> p_atom) >>= p_post_atom -- correct specification @@ -62,15 +63,15 @@ group_index = do return (Just index) p_group :: P Pattern -p_group = lookAhead (char '(') >> do - index <- group_index - liftM (PGroup index) $ between (char '(') (char ')') p_regex +p_group = do + _ <- lookAhead (char '(') + PGroup <$> group_index <*> between (char '(') (char ')') p_regex -- p_post_atom takes the previous atom as a parameter p_post_atom :: Pattern -> P Pattern -p_post_atom atom = (char '?' >> return (PQuest atom)) - <|> (char '+' >> return (PPlus atom)) - <|> (char '*' >> return (PStar True atom)) +p_post_atom atom = (char '?' $> PQuest atom) + <|> (char '+' $> PPlus atom) + <|> (char '*' $> PStar True atom) <|> p_bound atom <|> return atom @@ -78,19 +79,20 @@ p_bound :: Pattern -> P Pattern p_bound atom = try $ between (char '{') (char '}') (p_bound_spec atom) p_bound_spec :: Pattern -> P Pattern -p_bound_spec atom = do lowS <- many1 digit - let lowI = read lowS - highMI <- option (Just lowI) $ try $ do - _ <- char ',' - -- parsec note: if 'many digits' fails below then the 'try' ensures - -- that the ',' will not match the closing '}' in p_bound, same goes - -- for any non '}' garbage after the 'many digits'. - highS <- many digit - if null highS then return Nothing -- no upper bound - else do let highI = read highS - guard (lowI <= highI) - return (Just (read highS)) - return (PBound lowI highMI atom) +p_bound_spec atom = do + lowI <- read <$> many1 digit + highMI <- option (Just lowI) $ try $ do + _ <- char ',' + -- parsec note: if 'many digits' fails below then the 'try' ensures + -- that the ',' will not match the closing '}' in p_bound, same goes + -- for any non '}' garbage after the 'many digits'. + highS <- many digit + if null highS then return Nothing -- no upper bound + else do + let highI = read highS + guard (lowI <= highI) + return $ Just highI + return $ PBound lowI highMI atom -- An anchor cannot be modified by a repetition specifier p_anchor :: P Pattern @@ -102,38 +104,60 @@ p_anchor = (char '^' >> liftM PCarat char_index) "empty () or anchor ^ or $" char_index :: P DoPa -char_index = do (gi,ci) <- getState - let ci' = succ ci - setState (gi,ci') - return (DoPa ci') +char_index = do + (gi, ci) <- getState + let ci' = succ ci + setState (gi, ci') + return $ DoPa ci' p_char :: P Pattern -p_char = p_dot <|> p_left_brace <|> p_escaped <|> p_other_char where - p_dot = char '.' >> char_index >>= return . PDot - p_left_brace = try $ (char '{' >> notFollowedBy digit >> char_index >>= return . (`PChar` '{')) - p_escaped = char '\\' >> anyChar >>= \c -> char_index >>= return . (`PEscape` c) - p_other_char = noneOf specials >>= \c -> char_index >>= return . (`PChar` c) - where specials = "^.[$()|*+?{\\" +p_char = p_dot <|> p_left_brace <|> p_escaped <|> p_other_char + where + p_dot = do + _ <- char '.' + PDot <$> char_index + + p_left_brace = try $ do + _ <- char '{' + _ <- notFollowedBy digit + flip PChar '{' <$> char_index + + p_escaped = do + _ <- char '\\' + flip PEscape <$> anyChar <*> char_index + + p_other_char = flip PChar <$> noneOf "^.[$()|*+?{\\" <*> char_index -- parse [bar] and [^bar] sets of characters p_bracket :: P Pattern p_bracket = (char '[') >> ( (char '^' >> p_set True) <|> (p_set False) ) p_set :: Bool -> P Pattern -p_set invert = do initial <- option "" (char ']' >> return "]") - values <- if null initial then many1 p_set_elem else many p_set_elem - _ <- char ']' - ci <- char_index - let chars = maybe'set $ concat $ - initial : - [ c | BEChar c <- values ] : - [ [start..end] | BERange start end <- values ] - colls = maybe'set [PatternSetCollatingElement coll | BEColl coll <- values ] - equivs = maybe'set [PatternSetEquivalenceClass equiv | BEEquiv equiv <- values] - class's = maybe'set [PatternSetCharacterClass a'class | BEClass a'class <- values] - maybe'set x = if null x then Nothing else Just (Set.fromList x) - sets = PatternSet chars class's colls equivs - sets `seq` return $ if invert then PAnyNot ci sets else PAny ci sets +p_set invert = do + -- A ] as first character after the opening [ is treated as alternative ']' + -- rather than the closing bracket. + initial <- option mempty $ Set.singleton <$> char ']' + -- Parse remaining content of bracket expression. + values <- if Set.null initial then many1 p_set_elem else many p_set_elem + _ <- char ']' + ci <- char_index + -- Process the content of bracket expression into a PatternSet. + let !sets = foldl' (flip addBracketElement) (mempty{ _patternSetChars = initial }) values + return $ if invert then PAnyNot ci sets else PAny ci sets + +addBracketElement :: BracketElement -> PatternSet -> PatternSet +addBracketElement = \case + BEChar c -> + over patternSetChars $ Set.insert c + BERange start end -> + over patternSetChars $ (`Set.union` Set.fromDistinctAscList [start..end]) + -- Set.union is left-biased, [start..end] is considered the smaller set + BEClass s -> + over patternSetCharacterClasses $ Set.insert $ PatternSetCharacterClass s + BEColl s -> + over patternSetCollatingElements $ Set.insert $ PatternSetCollatingElement s + BEEquiv s -> + over patternSetEquivalenceClasses $ Set.insert $ PatternSetEquivalenceClass s -- From here down the code is the parser and functions for pattern [ ] set things @@ -161,15 +185,13 @@ p_set_elem_coll = liftM BEColl $ p_set_elem_range :: P BracketElement p_set_elem_range = try $ do - start <- noneOf "]" - _ <- char '-' - end <- noneOf "]" + start <- noneOf "]-" + _ <- char '-' + end <- noneOf "]" return $ BERange start end p_set_elem_char :: P BracketElement -p_set_elem_char = do - c <- noneOf "]" - return (BEChar c) +p_set_elem_char = BEChar <$> noneOf "]" -- | Fail when 'BracketElement' is invalid, e.g. empty range @1-0@. -- This failure should not be caught. diff --git a/lib/Text/Regex/TDFA/TDFA.hs b/lib/Text/Regex/TDFA/TDFA.hs index b45efbe..a45ae3e 100644 --- a/lib/Text/Regex/TDFA/TDFA.hs +++ b/lib/Text/Regex/TDFA/TDFA.hs @@ -28,6 +28,7 @@ import qualified Text.Regex.TDFA.IntArrTrieSet as Trie(lookupAsc,fromSinglesMerg import Text.Regex.TDFA.Pattern(Pattern) --import Text.Regex.TDFA.RunMutState(toInstructions) import Text.Regex.TDFA.TNFA(patternToNFA) +import Utils --import Debug.Trace {- By Chris Kuklewicz, 2007. BSD License, see the LICENSE file. -} diff --git a/lib/Text/Regex/TDFA/TNFA.hs b/lib/Text/Regex/TDFA/TNFA.hs index 9fa6437..baa3aef 100644 --- a/lib/Text/Regex/TDFA/TNFA.hs +++ b/lib/Text/Regex/TDFA/TNFA.hs @@ -29,8 +29,11 @@ -- -- Uses recursive do notation. -module Text.Regex.TDFA.TNFA(patternToNFA - ,QNFA(..),QT(..),QTrans,TagUpdate(..)) where +module Text.Regex.TDFA.TNFA + ( patternToNFA + , decodeCharacterClass, decodePatternSet + , QNFA(..), QT(..), QTrans, TagUpdate(..) + ) where {- By Chris Kuklewicz, 2007. BSD License, see the LICENSE file. -} @@ -48,16 +51,17 @@ import Data.IntSet.EnumSet2(EnumSet) import qualified Data.IntSet.EnumSet2 as Set(singleton,toList,insert) import Data.Maybe(catMaybes,isNothing) import Data.Monoid as Mon(Monoid(..)) -import qualified Data.Set as S(Set,insert,toAscList,empty) +import qualified Data.Set as S import Text.Regex.TDFA.Common(QT(..),QNFA(..),QTrans,TagTask(..),TagUpdate(..),DoPa(..) ,CompOption(..) ,Tag,TagTasks,TagList,Index,WinTags,GroupIndex,GroupInfo(..) - ,common_error,noWin,snd3,mapSnd) + ,common_error,noWin) import Text.Regex.TDFA.CorePattern(Q(..),P(..),OP(..),WhichTest,cleanNullView,NullView ,SetTestInfo(..),Wanted(..),TestInfo ,mustAccept,cannotAccept,patternToQ) import Text.Regex.TDFA.Pattern(Pattern(..),PatternSet(..),unSEC,PatternSetCharacterClass(..)) +import Utils --import Debug.Trace ecart :: String -> a -> a @@ -786,18 +790,18 @@ ADD ORPHAN ID check and make this a fatal error while testing -} --- | decodePatternSet cannot handle collating element and treats +-- | @decodePatternSet@ cannot handle collating element and treats -- equivalence classes as just their definition and nothing more. decodePatternSet :: PatternSet -> S.Set Char -decodePatternSet (PatternSet msc mscc _ msec) = - let baseMSC = maybe S.empty id msc - withMSCC = foldl (flip S.insert) baseMSC (maybe [] (concatMap decodeCharacterClass . S.toAscList) mscc) - withMSEC = foldl (flip S.insert) withMSCC (maybe [] (concatMap unSEC . S.toAscList) msec) - in withMSEC - --- | This returns the distinct ascending list of characters --- represented by [: :] values in legalCharacterClasses; unrecognized --- class names return an empty string +decodePatternSet (PatternSet chars ccs _ eqcs) = S.unions + [ chars + , foldMap (S.fromList . decodeCharacterClass) ccs + , foldMap (S.fromList . unSEC) eqcs + ] + +-- | This returns the strictly ascending list of characters +-- represented by @[: :]@ POSIX character classes. +-- Unrecognized class names return an empty string. decodeCharacterClass :: PatternSetCharacterClass -> String decodeCharacterClass (PatternSetCharacterClass s) = case s of diff --git a/lib/Utils.hs b/lib/Utils.hs new file mode 100644 index 0000000..acedb4f --- /dev/null +++ b/lib/Utils.hs @@ -0,0 +1,75 @@ +{-# LANGUAGE CPP #-} +{-# LANGUAGE RankNTypes #-} + +{-# OPTIONS_GHC -fno-warn-name-shadowing #-} + +-- | Internal module for utilities used in the implementation. + +module Utils (module Utils, module X) where + +import Control.Applicative (Const(..)) +import Control.Applicative as X ((<*>)) +import Data.Functor as X +import Data.Functor.Identity + +-- * Lenses +--------------------------------------------------------------------------- + +type Lens' o i = forall f. Functor f => (i -> f i) -> (o -> f o) + +type LensGet o i = o -> i +type LensSet o i = i -> o -> o +type LensMap o i = (i -> i) -> o -> o + +infixl 8 ^. +-- | Get inner part @i@ of structure @o@ as designated by @Lens' o i@. +(^.) :: o -> Lens' o i -> i +o ^. l = getConst $ l Const o + +-- | Set inner part @i@ of structure @o@ as designated by @Lens' o i@. +set :: Lens' o i -> LensSet o i +set l = over l . const + +-- | Modify inner part @i@ of structure @o@ using a function @i -> i@. +over :: Lens' o i -> LensMap o i +over l f o = runIdentity $ l (Identity . f) o + +-- * Misc +--------------------------------------------------------------------------- + +#if !MIN_VERSION_base(4,11,0) +(<&>) :: Functor f => f a -> (a -> b) -> f b +(<&>) = flip fmap +#endif + +-- | After 'sort' or 'sortBy' the use of 'nub' or 'nubBy' can be replaced by 'norep' or 'norepBy'. +norep :: Eq a => [a] -> [a] +norep = norepBy (==) + +-- | After 'sort' or 'sortBy' the use of 'nub' or 'nubBy' can be replaced by 'norep' or 'norepBy'. +norepBy :: (a -> a -> Bool) -> [a] -> [a] +norepBy _ [] = [] +norepBy eq (a:as) = loop a as + where + loop a [] = [a] + loop a (b:bs) = (if a `eq` b then id else (a:)) $ loop b bs + +mapFst :: Functor f => (t1 -> t2) -> f (t1, t) -> f (t2, t) +mapFst f = fmap $ \ (a, b) -> (f a, b) + +mapSnd :: Functor f => (t1 -> t2) -> f (t, t1) -> f (t, t2) +mapSnd f = fmap $ \ (a, b) -> (a, f b) + +fst3 :: (a,b,c) -> a +fst3 (x,_,_) = x + +snd3 :: (a,b,c) -> b +snd3 (_,x,_) = x + +thd3 :: (a,b,c) -> c +thd3 (_,_,x) = x + +flipOrder :: Ordering -> Ordering +flipOrder GT = LT +flipOrder LT = GT +flipOrder EQ = EQ diff --git a/regex-tdfa.cabal b/regex-tdfa.cabal index b3b8089..082031a 100644 --- a/regex-tdfa.cabal +++ b/regex-tdfa.cabal @@ -37,7 +37,6 @@ tested-with: GHC == 7.10.3 GHC == 7.8.4 GHC == 7.6.3 - GHC == 7.4.2 source-repository head type: git @@ -84,7 +83,8 @@ library Text.Regex.TDFA.Text Text.Regex.TDFA.Text.Lazy - other-modules: Paths_regex_tdfa + other-modules: Utils + Paths_regex_tdfa -- Support Semigroup instances uniformly -- @@ -98,7 +98,8 @@ library build-depends: fail == 4.9.* , semigroups == 0.18.* || == 0.19.* build-depends: array >= 0.4 && < 0.6 - , base >= 4.5 && < 5 + , base >= 4.6 && < 5 + -- GHC 7.6 required for LambdaCase , bytestring >= 0.9.2 && < 0.12 , containers >= 0.4.2 && < 0.7 , mtl >= 2.1.3 && < 2.4 @@ -113,6 +114,7 @@ library FlexibleInstances ForeignFunctionInterface FunctionalDependencies + LambdaCase MagicHash MultiParamTypeClasses NondecreasingIndentation @@ -123,6 +125,7 @@ library UnboxedTuples UnliftedFFITypes other-extensions: CPP + RankNTypes ghc-options: -Wall -funbox-strict-fields -fspec-constr-count=10 -fno-warn-orphans