Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Fixes for #16 and #43 #44

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions .github/workflows/haskell-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -92,11 +92,6 @@ jobs:
compilerVersion: 7.6.3
setup-method: hvr-ppa
allow-failure: false
- compiler: ghc-7.4.2
compilerKind: ghc
compilerVersion: 7.4.2
setup-method: hvr-ppa
allow-failure: false
fail-fast: false
steps:
- name: apt
Expand Down
38 changes: 1 addition & 37 deletions lib/Text/Regex/TDFA/Common.hs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import Data.Sequence as S(Seq)
--import Debug.Trace

import Text.Regex.TDFA.IntArrTrieSet(TrieSet)
import Utils

{-# INLINE look #-}
look :: Int -> IntMap a -> a
Expand All @@ -30,43 +31,6 @@ common_error :: String -> String -> a
common_error moduleName message =
error ("Explict error in module "++moduleName++" : "++message)

on :: (t1 -> t1 -> t2) -> (t -> t1) -> t -> t -> t2
f `on` g = (\x y -> (g x) `f` (g y))

-- | After 'sort' or 'sortBy' the use of 'nub' or 'nubBy' can be replaced by 'norep' or 'norepBy'.
norep :: (Eq a) => [a]->[a]
norep [] = []
norep x@[_] = x
norep (a:bs@(c:cs)) | a==c = norep (a:cs)
| otherwise = a:norep bs

-- | After 'sort' or 'sortBy' the use of 'nub' or 'nubBy' can be replaced by 'norep' or 'norepBy'.
norepBy :: (a -> a -> Bool) -> [a] -> [a]
norepBy _ [] = []
norepBy _ x@[_] = x
norepBy eqF (a:bs@(c:cs)) | a `eqF` c = norepBy eqF (a:cs)
| otherwise = a:norepBy eqF bs

mapFst :: (Functor f) => (t -> t2) -> f (t, t1) -> f (t2, t1)
mapFst f = fmap (\ (a,b) -> (f a,b))

mapSnd :: (Functor f) => (t1 -> t2) -> f (t, t1) -> f (t, t2)
mapSnd f = fmap (\ (a,b) -> (a,f b))

fst3 :: (a,b,c) -> a
fst3 (x,_,_) = x

snd3 :: (a,b,c) -> b
snd3 (_,x,_) = x

thd3 :: (a,b,c) -> c
thd3 (_,_,x) = x

flipOrder :: Ordering -> Ordering
flipOrder GT = LT
flipOrder LT = GT
flipOrder EQ = EQ

noWin :: WinTags -> Bool
noWin = null

Expand Down
1 change: 1 addition & 0 deletions lib/Text/Regex/TDFA/CorePattern.hs
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ import Data.Semigroup as Sem

import Text.Regex.TDFA.Common {- all -}
import Text.Regex.TDFA.Pattern(Pattern(..),starTrans)
import Utils
-- import Debug.Trace

{- By Chris Kuklewicz, 2007. BSD License, see the LICENSE file. -}
Expand Down
83 changes: 62 additions & 21 deletions lib/Text/Regex/TDFA/Pattern.hs
Original file line number Diff line number Diff line change
Expand Up @@ -5,25 +5,31 @@
-- the parsed form of a Regular Expression.

module Text.Regex.TDFA.Pattern
(Pattern(..)
,PatternSet(..)
,PatternSetCharacterClass(..)
,PatternSetCollatingElement(..)
,PatternSetEquivalenceClass(..)
,GroupIndex
,DoPa(..)
,showPattern
-- ** Internal use
,starTrans
-- ** Internal use, Operations to support debugging under ghci
,starTrans',simplify',dfsPattern
( Pattern(..)
, PatternSet(..)
, patternSetChars
, patternSetCharacterClasses
, patternSetCollatingElements
, patternSetEquivalenceClasses
, PatternSetCharacterClass(..)
, PatternSetCollatingElement(..)
, PatternSetEquivalenceClass(..)
, GroupIndex
, DoPa(..)
, showPattern
-- ** Internal use
, starTrans
-- ** Internal use, Operations to support debugging under ghci
, starTrans', simplify', dfsPattern
) where

{- By Chris Kuklewicz, 2007. BSD License, see the LICENSE file. -}

import Data.List(intersperse,partition)
import qualified Data.Set as Set(toAscList,toList)
import Data.Set(Set) -- XXX EnumSet

import Utils
import Text.Regex.TDFA.Common(DoPa(..),GroupIndex,common_error)

err :: String -> a
Expand Down Expand Up @@ -92,19 +98,54 @@ showPattern pIn =
-}
paren s = ('(':s)++")"

data PatternSet = PatternSet (Maybe (Set Char))
(Maybe (Set PatternSetCharacterClass))
(Maybe (Set PatternSetCollatingElement))
(Maybe (Set PatternSetEquivalenceClass))
deriving (Eq)
-- | Processed content of a bracket expression.
data PatternSet = PatternSet
{ _patternSetChars :: Set Char
-- ^ Characters included in the pattern.
, _patternSetCharacterClasses :: Set PatternSetCharacterClass
-- ^ POSIX character classes included in the pattern.
, _patternSetCollatingElements :: Set PatternSetCollatingElement
-- ^ Collating elements included in the pattern.
, _patternSetEquivalenceClasses :: Set PatternSetEquivalenceClass
-- ^ Equivalence classes included in the pattern.
}
deriving (Eq)

instance Semigroup PatternSet where
PatternSet a b c d <> PatternSet a' b' c' d' =
PatternSet (a <> a') (b <> b') (c <> c') (d <> d')

instance Monoid PatternSet where
mempty = PatternSet mempty mempty mempty mempty
mappend = (<>)

-- | Lens for '_patternSetChars'.
patternSetChars :: Lens' PatternSet (Set Char)
patternSetChars f ps =
f (_patternSetChars ps) <&> \ i -> ps{ _patternSetChars = i }

-- | Lens for '_patternSetCharacterClasses'.
patternSetCharacterClasses :: Lens' PatternSet (Set PatternSetCharacterClass)
patternSetCharacterClasses f ps =
f (_patternSetCharacterClasses ps) <&> \ i -> ps{ _patternSetCharacterClasses = i }

-- | Lens for '_patternSetCollatingElements'.
patternSetCollatingElements :: Lens' PatternSet (Set PatternSetCollatingElement)
patternSetCollatingElements f ps =
f (_patternSetCollatingElements ps) <&> \ i -> ps{ _patternSetCollatingElements = i }

-- | Lens for '_patternSetEquivalenceClasses'.
patternSetEquivalenceClasses :: Lens' PatternSet (Set PatternSetEquivalenceClass)
patternSetEquivalenceClasses f ps =
f (_patternSetEquivalenceClasses ps) <&> \ i -> ps{ _patternSetEquivalenceClasses = i }

instance Show PatternSet where
showsPrec i (PatternSet s scc sce sec) =
let (special,normal) = maybe ("","") ((partition (`elem` "]-")) . Set.toAscList) s
let (special,normal) = partition (`elem` "]-") $ Set.toAscList s
charSpec = (if ']' `elem` special then (']':) else id) (byRange normal)
scc' = maybe "" ((concatMap show) . Set.toList) scc
sce' = maybe "" ((concatMap show) . Set.toList) sce
sec' = maybe "" ((concatMap show) . Set.toList) sec
scc' = concatMap show $ Set.toList scc
sce' = concatMap show $ Set.toList sce
sec' = concatMap show $ Set.toList sec
in shows charSpec
. showsPrec i scc' . showsPrec i sce' . showsPrec i sec'
. if '-' `elem` special then showChar '-' else id
Expand Down
128 changes: 75 additions & 53 deletions lib/Text/Regex/TDFA/ReadRegex.hs
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,12 @@ import Text.ParserCombinators.Parsec((<|>), (<?>),
try, runParser, many, getState, setState, CharParser, ParseError,
sepBy1, option, notFollowedBy, many1, lookAhead, eof, between,
string, noneOf, digit, char, anyChar)
import Utils

import Control.Monad (liftM, guard)

import Data.Foldable (asum)
import qualified Data.Set as Set(fromList)
import Data.Foldable (asum, foldl')
import qualified Data.Set as Set

-- | An element inside @[...]@, denoting a character class.
data BracketElement
Expand All @@ -41,12 +42,12 @@ parseRegex x = runParser (do pat <- p_regex
type P = CharParser (GroupIndex, Int)

p_regex :: P Pattern
p_regex = liftM POr $ sepBy1 p_branch (char '|')
p_regex = POr <$> sepBy1 p_branch (char '|')

-- man re_format helps a lot, it says one-or-more pieces so this is
-- many1 not many. Use "()" to indicate an empty piece.
p_branch :: P Pattern
p_branch = liftM PConcat $ many1 p_piece
p_branch = PConcat <$> many1 p_piece

p_piece :: P Pattern
p_piece = (p_anchor <|> p_atom) >>= p_post_atom -- correct specification
Expand All @@ -62,35 +63,36 @@ group_index = do
return (Just index)

p_group :: P Pattern
p_group = lookAhead (char '(') >> do
index <- group_index
liftM (PGroup index) $ between (char '(') (char ')') p_regex
p_group = do
_ <- lookAhead (char '(')
PGroup <$> group_index <*> between (char '(') (char ')') p_regex

-- p_post_atom takes the previous atom as a parameter
p_post_atom :: Pattern -> P Pattern
p_post_atom atom = (char '?' >> return (PQuest atom))
<|> (char '+' >> return (PPlus atom))
<|> (char '*' >> return (PStar True atom))
p_post_atom atom = (char '?' $> PQuest atom)
<|> (char '+' $> PPlus atom)
<|> (char '*' $> PStar True atom)
<|> p_bound atom
<|> return atom

p_bound :: Pattern -> P Pattern
p_bound atom = try $ between (char '{') (char '}') (p_bound_spec atom)

p_bound_spec :: Pattern -> P Pattern
p_bound_spec atom = do lowS <- many1 digit
let lowI = read lowS
highMI <- option (Just lowI) $ try $ do
_ <- char ','
-- parsec note: if 'many digits' fails below then the 'try' ensures
-- that the ',' will not match the closing '}' in p_bound, same goes
-- for any non '}' garbage after the 'many digits'.
highS <- many digit
if null highS then return Nothing -- no upper bound
else do let highI = read highS
guard (lowI <= highI)
return (Just (read highS))
return (PBound lowI highMI atom)
p_bound_spec atom = do
lowI <- read <$> many1 digit
highMI <- option (Just lowI) $ try $ do
_ <- char ','
-- parsec note: if 'many digits' fails below then the 'try' ensures
-- that the ',' will not match the closing '}' in p_bound, same goes
-- for any non '}' garbage after the 'many digits'.
highS <- many digit
if null highS then return Nothing -- no upper bound
else do
let highI = read highS
guard (lowI <= highI)
return $ Just highI
return $ PBound lowI highMI atom

-- An anchor cannot be modified by a repetition specifier
p_anchor :: P Pattern
Expand All @@ -102,38 +104,60 @@ p_anchor = (char '^' >> liftM PCarat char_index)
<?> "empty () or anchor ^ or $"

char_index :: P DoPa
char_index = do (gi,ci) <- getState
let ci' = succ ci
setState (gi,ci')
return (DoPa ci')
char_index = do
(gi, ci) <- getState
let ci' = succ ci
setState (gi, ci')
return $ DoPa ci'

p_char :: P Pattern
p_char = p_dot <|> p_left_brace <|> p_escaped <|> p_other_char where
p_dot = char '.' >> char_index >>= return . PDot
p_left_brace = try $ (char '{' >> notFollowedBy digit >> char_index >>= return . (`PChar` '{'))
p_escaped = char '\\' >> anyChar >>= \c -> char_index >>= return . (`PEscape` c)
p_other_char = noneOf specials >>= \c -> char_index >>= return . (`PChar` c)
where specials = "^.[$()|*+?{\\"
p_char = p_dot <|> p_left_brace <|> p_escaped <|> p_other_char
where
p_dot = do
_ <- char '.'
PDot <$> char_index

p_left_brace = try $ do
_ <- char '{'
_ <- notFollowedBy digit
flip PChar '{' <$> char_index

p_escaped = do
_ <- char '\\'
flip PEscape <$> anyChar <*> char_index

p_other_char = flip PChar <$> noneOf "^.[$()|*+?{\\" <*> char_index

-- parse [bar] and [^bar] sets of characters
p_bracket :: P Pattern
p_bracket = (char '[') >> ( (char '^' >> p_set True) <|> (p_set False) )

p_set :: Bool -> P Pattern
p_set invert = do initial <- option "" (char ']' >> return "]")
values <- if null initial then many1 p_set_elem else many p_set_elem
_ <- char ']'
ci <- char_index
let chars = maybe'set $ concat $
initial :
[ c | BEChar c <- values ] :
[ [start..end] | BERange start end <- values ]
colls = maybe'set [PatternSetCollatingElement coll | BEColl coll <- values ]
equivs = maybe'set [PatternSetEquivalenceClass equiv | BEEquiv equiv <- values]
class's = maybe'set [PatternSetCharacterClass a'class | BEClass a'class <- values]
maybe'set x = if null x then Nothing else Just (Set.fromList x)
sets = PatternSet chars class's colls equivs
sets `seq` return $ if invert then PAnyNot ci sets else PAny ci sets
p_set invert = do
-- A ] as first character after the opening [ is treated as alternative ']'
-- rather than the closing bracket.
initial <- option mempty $ Set.singleton <$> char ']'
-- Parse remaining content of bracket expression.
values <- if Set.null initial then many1 p_set_elem else many p_set_elem
_ <- char ']'
ci <- char_index
-- Process the content of bracket expression into a PatternSet.
let !sets = foldl' (flip addBracketElement) (mempty{ _patternSetChars = initial }) values
return $ if invert then PAnyNot ci sets else PAny ci sets

addBracketElement :: BracketElement -> PatternSet -> PatternSet
addBracketElement = \case
BEChar c ->
over patternSetChars $ Set.insert c
BERange start end ->
over patternSetChars $ (`Set.union` Set.fromDistinctAscList [start..end])
-- Set.union is left-biased, [start..end] is considered the smaller set
BEClass s ->
over patternSetCharacterClasses $ Set.insert $ PatternSetCharacterClass s
BEColl s ->
over patternSetCollatingElements $ Set.insert $ PatternSetCollatingElement s
BEEquiv s ->
over patternSetEquivalenceClasses $ Set.insert $ PatternSetEquivalenceClass s

-- From here down the code is the parser and functions for pattern [ ] set things

Expand Down Expand Up @@ -161,15 +185,13 @@ p_set_elem_coll = liftM BEColl $

p_set_elem_range :: P BracketElement
p_set_elem_range = try $ do
start <- noneOf "]"
_ <- char '-'
end <- noneOf "]"
start <- noneOf "]-"
Copy link
Member Author

@andreasabel andreasabel Jul 18, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug here introduced by rebase.

_ <- char '-'
end <- noneOf "]"
return $ BERange start end

p_set_elem_char :: P BracketElement
p_set_elem_char = do
c <- noneOf "]"
return (BEChar c)
p_set_elem_char = BEChar <$> noneOf "]"

-- | Fail when 'BracketElement' is invalid, e.g. empty range @1-0@.
-- This failure should not be caught.
Expand Down
1 change: 1 addition & 0 deletions lib/Text/Regex/TDFA/TDFA.hs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import qualified Text.Regex.TDFA.IntArrTrieSet as Trie(lookupAsc,fromSinglesMerg
import Text.Regex.TDFA.Pattern(Pattern)
--import Text.Regex.TDFA.RunMutState(toInstructions)
import Text.Regex.TDFA.TNFA(patternToNFA)
import Utils
--import Debug.Trace

{- By Chris Kuklewicz, 2007. BSD License, see the LICENSE file. -}
Expand Down
Loading