haskell-hvr · andreasabel · Jul 18, 2022 · Jul 18, 2022 · Jul 18, 2022 · Jul 18, 2022
diff --git a/.github/workflows/haskell-ci.yml b/.github/workflows/haskell-ci.yml
@@ -92,11 +92,6 @@ jobs:
             compilerVersion: 7.6.3
             setup-method: hvr-ppa
             allow-failure: false
-          - compiler: ghc-7.4.2
-            compilerKind: ghc
-            compilerVersion: 7.4.2
-            setup-method: hvr-ppa
-            allow-failure: false
       fail-fast: false
     steps:
       - name: apt

diff --git a/lib/Text/Regex/TDFA/Common.hs b/lib/Text/Regex/TDFA/Common.hs
@@ -21,6 +21,7 @@ import Data.Sequence as S(Seq)
 --import Debug.Trace
 
 import Text.Regex.TDFA.IntArrTrieSet(TrieSet)
+import Utils
 
 {-# INLINE look #-}
 look :: Int -> IntMap a -> a
@@ -30,43 +31,6 @@ common_error :: String -> String -> a
 common_error moduleName message =
   error ("Explict error in module "++moduleName++" : "++message)
 
-on :: (t1 -> t1 -> t2) -> (t -> t1) -> t -> t -> t2
-f `on` g = (\x y -> (g x) `f` (g y))
-
--- | After 'sort' or 'sortBy' the use of 'nub' or 'nubBy' can be replaced by 'norep' or 'norepBy'.
-norep :: (Eq a) => [a]->[a]
-norep [] = []
-norep x@[_] = x
-norep (a:bs@(c:cs)) | a==c = norep (a:cs)
-                    | otherwise = a:norep bs
-
--- | After 'sort' or 'sortBy' the use of 'nub' or 'nubBy' can be replaced by 'norep' or 'norepBy'.
-norepBy :: (a -> a -> Bool) -> [a] -> [a]
-norepBy _ [] = []
-norepBy _ x@[_] = x
-norepBy eqF (a:bs@(c:cs)) | a `eqF` c = norepBy eqF (a:cs)
-                          | otherwise = a:norepBy eqF bs
-
-mapFst :: (Functor f) => (t -> t2) -> f (t, t1) -> f (t2, t1)
-mapFst f = fmap (\ (a,b) -> (f a,b))
-
-mapSnd :: (Functor f) => (t1 -> t2) -> f (t, t1) -> f (t, t2)
-mapSnd f = fmap (\ (a,b) -> (a,f b))
-
-fst3 :: (a,b,c) -> a
-fst3 (x,_,_) = x
-
-snd3 :: (a,b,c) -> b
-snd3 (_,x,_) = x
-
-thd3 :: (a,b,c) -> c
-thd3 (_,_,x) = x
-
-flipOrder :: Ordering -> Ordering
-flipOrder GT = LT
-flipOrder LT = GT
-flipOrder EQ = EQ
-
 noWin :: WinTags -> Bool
 noWin = null
 

diff --git a/lib/Text/Regex/TDFA/CorePattern.hs b/lib/Text/Regex/TDFA/CorePattern.hs
@@ -49,6 +49,7 @@ import Data.Semigroup as Sem
 
 import Text.Regex.TDFA.Common {- all -}
 import Text.Regex.TDFA.Pattern(Pattern(..),starTrans)
+import Utils
 -- import Debug.Trace
 
 {- By Chris Kuklewicz, 2007. BSD License, see the LICENSE file. -}

diff --git a/lib/Text/Regex/TDFA/Pattern.hs b/lib/Text/Regex/TDFA/Pattern.hs
@@ -5,25 +5,31 @@
 -- the parsed form of a Regular Expression.
 
 module Text.Regex.TDFA.Pattern
-    (Pattern(..)
-    ,PatternSet(..)
-    ,PatternSetCharacterClass(..)
-    ,PatternSetCollatingElement(..)
-    ,PatternSetEquivalenceClass(..)
-    ,GroupIndex
-    ,DoPa(..)
-    ,showPattern
--- ** Internal use
-    ,starTrans
--- ** Internal use, Operations to support debugging under ghci
-    ,starTrans',simplify',dfsPattern
+    ( Pattern(..)
+    , PatternSet(..)
+    , patternSetChars
+    , patternSetCharacterClasses
+    , patternSetCollatingElements
+    , patternSetEquivalenceClasses
+    , PatternSetCharacterClass(..)
+    , PatternSetCollatingElement(..)
+    , PatternSetEquivalenceClass(..)
+    , GroupIndex
+    , DoPa(..)
+    , showPattern
+      -- ** Internal use
+    , starTrans
+      -- ** Internal use, Operations to support debugging under ghci
+    , starTrans', simplify', dfsPattern
     ) where
 
 {- By Chris Kuklewicz, 2007. BSD License, see the LICENSE file. -}
 
 import Data.List(intersperse,partition)
 import qualified Data.Set as Set(toAscList,toList)
 import Data.Set(Set) -- XXX EnumSet
+
+import Utils
 import Text.Regex.TDFA.Common(DoPa(..),GroupIndex,common_error)
 
 err :: String -> a
@@ -92,19 +98,54 @@ showPattern pIn =
 -}
         paren s = ('(':s)++")"
 
-data PatternSet = PatternSet (Maybe (Set Char))
-                             (Maybe (Set PatternSetCharacterClass))
-                             (Maybe (Set PatternSetCollatingElement))
-                             (Maybe (Set PatternSetEquivalenceClass))
-                             deriving (Eq)
+-- | Processed content of a bracket expression.
+data PatternSet = PatternSet
+  { _patternSetChars              :: Set Char
+      -- ^ Characters included in the pattern.
+  , _patternSetCharacterClasses   :: Set PatternSetCharacterClass
+      -- ^ POSIX character classes included in the pattern.
+  , _patternSetCollatingElements  :: Set PatternSetCollatingElement
+      -- ^ Collating elements included in the pattern.
+  , _patternSetEquivalenceClasses :: Set PatternSetEquivalenceClass
+      -- ^ Equivalence classes included in the pattern.
+  }
+  deriving (Eq)
+
+instance Semigroup PatternSet where
+  PatternSet a b c d <> PatternSet a' b' c' d' =
+   PatternSet (a <> a') (b <> b') (c <> c') (d <> d')
+
+instance Monoid PatternSet where
+  mempty  = PatternSet mempty mempty mempty mempty
+  mappend = (<>)
+
+-- | Lens for '_patternSetChars'.
+patternSetChars :: Lens' PatternSet (Set Char)
+patternSetChars f ps =
+  f (_patternSetChars ps) <&> \ i -> ps{ _patternSetChars = i }
+
+-- | Lens for '_patternSetCharacterClasses'.
+patternSetCharacterClasses :: Lens' PatternSet (Set PatternSetCharacterClass)
+patternSetCharacterClasses f ps =
+  f (_patternSetCharacterClasses ps) <&> \ i -> ps{ _patternSetCharacterClasses = i }
+
+-- | Lens for '_patternSetCollatingElements'.
+patternSetCollatingElements :: Lens' PatternSet (Set PatternSetCollatingElement)
+patternSetCollatingElements f ps =
+  f (_patternSetCollatingElements ps) <&> \ i -> ps{ _patternSetCollatingElements = i }
+
+-- | Lens for '_patternSetEquivalenceClasses'.
+patternSetEquivalenceClasses :: Lens' PatternSet (Set PatternSetEquivalenceClass)
+patternSetEquivalenceClasses f ps =
+  f (_patternSetEquivalenceClasses ps) <&> \ i -> ps{ _patternSetEquivalenceClasses = i }
 
 instance Show PatternSet where
   showsPrec i (PatternSet s scc sce sec) =
-    let (special,normal) = maybe ("","") ((partition (`elem` "]-")) . Set.toAscList) s
+    let (special,normal) = partition (`elem` "]-") $ Set.toAscList s
         charSpec = (if ']' `elem` special then (']':) else id) (byRange normal)
-        scc' = maybe "" ((concatMap show) . Set.toList) scc
-        sce' = maybe "" ((concatMap show) . Set.toList) sce
-        sec' = maybe "" ((concatMap show) . Set.toList) sec
+        scc' = concatMap show $ Set.toList scc
+        sce' = concatMap show $ Set.toList sce
+        sec' = concatMap show $ Set.toList sec
     in shows charSpec
        . showsPrec i scc' . showsPrec i sce' . showsPrec i sec'
        . if '-' `elem` special then showChar '-' else id

diff --git a/lib/Text/Regex/TDFA/ReadRegex.hs b/lib/Text/Regex/TDFA/ReadRegex.hs
@@ -14,11 +14,12 @@ import Text.ParserCombinators.Parsec((<|>), (<?>),
   try, runParser, many, getState, setState, CharParser, ParseError,
   sepBy1, option, notFollowedBy, many1, lookAhead, eof, between,
   string, noneOf, digit, char, anyChar)
+import Utils
 
 import Control.Monad (liftM, guard)
 
-import Data.Foldable (asum)
-import qualified Data.Set as Set(fromList)
+import Data.Foldable (asum, foldl')
+import qualified Data.Set as Set
 
 -- | An element inside @[...]@, denoting a character class.
 data BracketElement
@@ -41,12 +42,12 @@ parseRegex x = runParser (do pat <- p_regex
 type P = CharParser (GroupIndex, Int)
 
 p_regex :: P Pattern
-p_regex = liftM POr $ sepBy1 p_branch (char '|')
+p_regex = POr <$> sepBy1 p_branch (char '|')
 
 -- man re_format helps a lot, it says one-or-more pieces so this is
 -- many1 not many.  Use "()" to indicate an empty piece.
 p_branch :: P Pattern
-p_branch = liftM PConcat $ many1 p_piece
+p_branch = PConcat <$> many1 p_piece
 
 p_piece :: P Pattern
 p_piece = (p_anchor <|> p_atom) >>= p_post_atom -- correct specification
@@ -62,35 +63,36 @@ group_index = do
   return (Just index)
 
 p_group :: P Pattern
-p_group = lookAhead (char '(') >> do
-  index <- group_index
-  liftM (PGroup index) $ between (char '(') (char ')') p_regex
+p_group = do
+  _ <- lookAhead (char '(')
+  PGroup <$> group_index <*> between (char '(') (char ')') p_regex
 
 -- p_post_atom takes the previous atom as a parameter
 p_post_atom :: Pattern -> P Pattern
-p_post_atom atom = (char '?' >> return (PQuest atom))
-               <|> (char '+' >> return (PPlus atom))
-               <|> (char '*' >> return (PStar True atom))
+p_post_atom atom = (char '?' $> PQuest atom)
+               <|> (char '+' $> PPlus atom)
+               <|> (char '*' $> PStar True atom)
                <|> p_bound atom
                <|> return atom
 
 p_bound :: Pattern -> P Pattern
 p_bound atom = try $ between (char '{') (char '}') (p_bound_spec atom)
 
 p_bound_spec :: Pattern -> P Pattern
-p_bound_spec atom = do lowS <- many1 digit
-                       let lowI = read lowS
-                       highMI <- option (Just lowI) $ try $ do
-                                   _ <- char ','
-  -- parsec note: if 'many digits' fails below then the 'try' ensures
-  -- that the ',' will not match the closing '}' in p_bound, same goes
-  -- for any non '}' garbage after the 'many digits'.
-                                   highS <- many digit
-                                   if null highS then return Nothing -- no upper bound
-                                     else do let highI = read highS
-                                             guard (lowI <= highI)
-                                             return (Just (read highS))
-                       return (PBound lowI highMI atom)
+p_bound_spec atom = do
+  lowI <- read <$> many1 digit
+  highMI <- option (Just lowI) $ try $ do
+    _ <- char ','
+    -- parsec note: if 'many digits' fails below then the 'try' ensures
+    -- that the ',' will not match the closing '}' in p_bound, same goes
+    -- for any non '}' garbage after the 'many digits'.
+    highS <- many digit
+    if null highS then return Nothing -- no upper bound
+    else do
+      let highI = read highS
+      guard (lowI <= highI)
+      return $ Just highI
+  return $ PBound lowI highMI atom
 
 -- An anchor cannot be modified by a repetition specifier
 p_anchor :: P Pattern
@@ -102,38 +104,60 @@ p_anchor = (char '^' >> liftM PCarat char_index)
        <?> "empty () or anchor ^ or $"
 
 char_index :: P DoPa
-char_index = do (gi,ci) <- getState
-                let ci' = succ ci
-                setState (gi,ci')
-                return (DoPa ci')
+char_index = do
+  (gi, ci) <- getState
+  let ci' = succ ci
+  setState (gi, ci')
+  return $ DoPa ci'
 
 p_char :: P Pattern
-p_char = p_dot <|> p_left_brace <|> p_escaped <|> p_other_char where
-  p_dot = char '.' >> char_index >>= return . PDot
-  p_left_brace = try $ (char '{' >> notFollowedBy digit >> char_index >>= return . (`PChar` '{'))
-  p_escaped = char '\\' >> anyChar >>= \c -> char_index >>= return . (`PEscape` c)
-  p_other_char = noneOf specials >>= \c -> char_index >>= return . (`PChar` c)
-    where specials  = "^.[$()|*+?{\\"
+p_char = p_dot <|> p_left_brace <|> p_escaped <|> p_other_char
+  where
+  p_dot = do
+    _ <- char '.'
+    PDot <$> char_index
+
+  p_left_brace = try $ do
+    _ <- char '{'
+    _ <- notFollowedBy digit
+    flip PChar '{' <$> char_index
+
+  p_escaped = do
+    _ <- char '\\'
+    flip PEscape <$> anyChar <*> char_index
+
+  p_other_char = flip PChar <$> noneOf "^.[$()|*+?{\\" <*> char_index
 
 -- parse [bar] and [^bar] sets of characters
 p_bracket :: P Pattern
 p_bracket = (char '[') >> ( (char '^' >> p_set True) <|> (p_set False) )
 
 p_set :: Bool -> P Pattern
-p_set invert = do initial <- option "" (char ']' >> return "]")
-                  values <- if null initial then many1 p_set_elem else many p_set_elem
-                  _ <- char ']'
-                  ci <- char_index
-                  let chars = maybe'set $ concat $
-                        initial :
-                        [ c | BEChar c <- values ] :
-                        [ [start..end] | BERange start end <- values ]
-                      colls = maybe'set [PatternSetCollatingElement coll | BEColl coll <- values ]
-                      equivs = maybe'set [PatternSetEquivalenceClass equiv | BEEquiv equiv <- values]
-                      class's = maybe'set [PatternSetCharacterClass a'class | BEClass a'class <- values]
-                      maybe'set x = if null x then Nothing else Just (Set.fromList x)
-                      sets = PatternSet chars class's colls equivs
-                  sets `seq` return $ if invert then PAnyNot ci sets else PAny ci sets
+p_set invert = do
+  -- A ] as first character after the opening [ is treated as alternative ']'
+  -- rather than the closing bracket.
+  initial <- option mempty $ Set.singleton <$> char ']'
+  -- Parse remaining content of bracket expression.
+  values  <- if Set.null initial then many1 p_set_elem else many p_set_elem
+  _       <- char ']'
+  ci      <- char_index
+  -- Process the content of bracket expression into a PatternSet.
+  let !sets = foldl' (flip addBracketElement) (mempty{ _patternSetChars = initial }) values
+  return $ if invert then PAnyNot ci sets else PAny ci sets
+
+addBracketElement :: BracketElement -> PatternSet -> PatternSet
+addBracketElement = \case
+  BEChar  c         ->
+    over patternSetChars $ Set.insert c
+  BERange start end ->
+    over patternSetChars $ (`Set.union` Set.fromDistinctAscList [start..end])
+      -- Set.union is left-biased, [start..end] is considered the smaller set
+  BEClass s ->
+    over patternSetCharacterClasses $ Set.insert $ PatternSetCharacterClass s
+  BEColl  s ->
+    over patternSetCollatingElements $ Set.insert $ PatternSetCollatingElement s
+  BEEquiv s ->
+    over patternSetEquivalenceClasses $ Set.insert $ PatternSetEquivalenceClass s
 
 -- From here down the code is the parser and functions for pattern [ ] set things
 
@@ -161,15 +185,13 @@ p_set_elem_coll =  liftM BEColl $
 
 p_set_elem_range :: P BracketElement
 p_set_elem_range = try $ do
-  start <- noneOf "]"
-  _  <- char '-'
-  end <- noneOf "]"
+  start <- noneOf "]-"
+  _     <- char '-'
+  end   <- noneOf "]"
   return $ BERange start end
 
 p_set_elem_char :: P BracketElement
-p_set_elem_char = do
-  c <- noneOf "]"
-  return (BEChar c)
+p_set_elem_char = BEChar <$> noneOf "]"
 
 -- | Fail when 'BracketElement' is invalid, e.g. empty range @1-0@.
 -- This failure should not be caught.

diff --git a/lib/Text/Regex/TDFA/TDFA.hs b/lib/Text/Regex/TDFA/TDFA.hs
@@ -28,6 +28,7 @@ import qualified Text.Regex.TDFA.IntArrTrieSet as Trie(lookupAsc,fromSinglesMerg
 import Text.Regex.TDFA.Pattern(Pattern)
 --import Text.Regex.TDFA.RunMutState(toInstructions)
 import Text.Regex.TDFA.TNFA(patternToNFA)
+import Utils
 --import Debug.Trace
 
 {- By Chris Kuklewicz, 2007. BSD License, see the LICENSE file. -}