Version 0.1.4.0. Redesigned Range (#61)

* Version 0.1.4.0. Redesigned Range * Range update * More tests for the Test god, minor fixes * Added cobot
biocad · Oct 7, 2021 · eed0c67 · eed0c67
1 parent b3c53a7
commit eed0c67
Show file tree

Hide file tree

Showing 20 changed files with 1,289 additions and 369 deletions.
diff --git a/ChangeLog.md b/ChangeLog.md
@@ -2,6 +2,11 @@
 
 ## [Unreleased]
 
+## [0.1.4.0] - 2021-09-27
+### Changed
+- Redesigned the Range type to reflect all possible cases.
+- Switched to Megaparsec.
+
 ## [0.1.3.25] - 2021-09-15
 ### Added
 - Pedantic build and CI

diff --git a/package.yaml b/package.yaml
@@ -1,5 +1,5 @@
 name:                cobot-io
-version:             0.1.3.25
+version:             0.1.4.0
 github:              "biocad/cobot-io"
 license:             BSD3
 category:            Bio
@@ -20,21 +20,24 @@ extra-source-files:
 description:         Please see the README on GitHub at <https://github.com/biocad/cobot-io#readme>
 
 dependencies:
-- base >= 4.7 && < 5
-- data-msgpack >= 0.0.9 && < 0.1
-- text >= 1.2.2.1 && < 1.3
-- bytestring >= 0.10.8.1 && < 0.11
-- split
+- base >= 4.14 && < 5
+- array >= 0.5 && < 0.6
 - attoparsec >= 0.10 && < 0.15
 - binary >= 0.8.3.0 && < 1.0
-- mtl >= 2.2.1 && < 2.3.0
-- hyraxAbif >= 0.2.3.27 && < 0.2.4.0
+- bytestring >= 0.10.8.1 && < 0.11
+- cobot >= 0.1.1.7 
 - containers >= 0.5.7.1 && < 0.7
-- http-conduit >= 2.3 && < 2.4
-- array >= 0.5 && < 0.6
+- data-msgpack >= 0.0.9 && < 0.1
 - deepseq >= 1.4 && < 1.5
+- http-conduit >= 2.3 && < 2.4
+- hyraxAbif >= 0.2.3.27 && < 0.2.4.0
 - lens >= 4.16 && < 5.1
 - linear >= 1.20 && < 1.22
+- megaparsec >= 9.0.1
+- mtl >= 2.2.1 && < 2.3.0
+- parser-combinators >= 1.2.1
+- split
+- text >= 1.2.2.1 && < 1.3
 - vector
 
 library:

diff --git a/src/Bio/GB.hs b/src/Bio/GB.hs
@@ -1,5 +1,3 @@
-{-# LANGUAGE CPP #-}
-
 module Bio.GB
   ( module T
   , fromFile
@@ -13,19 +11,15 @@ import           Bio.GB.Parser
 import           Bio.GB.Type            as T
 import           Bio.GB.Writer          (genBankToText)
 import           Control.Monad.IO.Class (MonadIO, liftIO)
-import           Data.Attoparsec.Text   (parseOnly)
 import           Data.Bifunctor         (first)
 import           Data.Text              (Text, pack)
 import qualified Data.Text.IO           as TIO (readFile, writeFile)
-#if !MIN_VERSION_base(4,13,0)
-import           Control.Monad.Fail     (MonadFail(..))
-import           Prelude                hiding (fail)
-#endif
+import           Text.Megaparsec        (eof, errorBundlePretty, parse)
 
 -- | Reads 'GenBankSequence' from givem file.
 --
 fromFile :: (MonadFail m, MonadIO m) => FilePath -> m GenBankSequence
-fromFile f = liftIO (TIO.readFile f) >>= either fail pure . parseOnly genBankP
+fromFile f = liftIO (TIO.readFile f) >>= either (fail . errorBundlePretty) pure . parse (genBankP <* eof) ""
 
 -- | Writes 'GenBankSequence' to file.
 --
@@ -35,7 +29,7 @@ toFile s f = liftIO $ TIO.writeFile f $ genBankToText s
 -- | Reads 'GenBankSequence' from 'Text'.
 --
 fromText :: Text -> Either Text GenBankSequence
-fromText = first pack . parseOnly genBankP
+fromText = first (pack . errorBundlePretty) . parse (genBankP <* eof) ""
 
 -- | Writes 'GenBankSequence' to 'Text'.
 --

diff --git a/src/Bio/GB/Parser.hs b/src/Bio/GB/Parser.hs
@@ -2,28 +2,28 @@
 
 module Bio.GB.Parser
   ( genBankP
+  , rangeP
   ) where
 
 import Bio.GB.Type                (Feature (..), Form (..), GenBankSequence (..), Locus (..),
-                                   Meta (..), Reference (..), Source (..), Version (..))
-import Bio.Sequence               (MarkedSequence, Range, markedSequence)
-import Control.Applicative        ((<|>))
-import Data.Attoparsec.Combinator (manyTill)
-import Data.Attoparsec.Text       (Parser, char, decimal, digit, endOfInput, endOfLine, letter,
-                                   many', many1', satisfy, string, takeWhile, takeWhile1, (<?>))
-import Data.Bifunctor             (bimap)
+                                   Meta (..), Parser, Reference (..), Source (..), Version (..))
+import Bio.Sequence               (Border (..), MarkedSequence, Range (..), RangeBorder (..),
+                                   markedSequence, shiftRange)
+import Control.Monad.Combinators  (many, manyTill, optional, some, (<|>))
 import Data.Char                  (isAlphaNum, isSpace, isUpper)
 import Data.Functor               (($>))
 import Data.Text                  (Text, intercalate, pack, splitOn, unpack)
-import Prelude                    hiding (takeWhile)
+import Text.Megaparsec            (option, satisfy, sepBy1, takeWhile1P, takeWhileP, try, (<?>))
+import Text.Megaparsec.Char       (char, digitChar, eol, letterChar, string)
+import Text.Megaparsec.Char.Lexer (decimal)
 
 -- | Parser of .gb file.
 --
 genBankP :: Parser GenBankSequence
 genBankP =  GenBankSequence
         <$> (metaP <?> "Meta parser")
         <*> (gbSeqP <?> "GB sequence parser")
-        <*  string "//" <* eolSpaceP <* endOfInput
+        <*  string "//" <* eolSpaceP 
 
 --------------------------------------------------------------------------------
 -- Block with meta-information.
@@ -33,13 +33,13 @@ metaP :: Parser Meta
 metaP = do
   locus'      <- locusP <?> "Locus parser"
 
-  definitionM <- wrapMP definitionP <?> "Definition parser"
-  accessionM  <- wrapMP accessionP <?> "Accession parser"
-  versionM    <- wrapMP versionP <?> "Version parser"
-  keywordsM   <- wrapMP keywordsP <?> "Keywords parser"
-  sourceM     <- wrapMP sourceP <?> "Source parser"
-  referencesL <- many' referenceP <?> "References parser"
-  commentsL   <- many' commentP <?> "Comments parser"
+  definitionM <- optional definitionP <?> "Definition parser"
+  accessionM  <- optional accessionP <?> "Accession parser"
+  versionM    <- optional versionP <?> "Version parser"
+  keywordsM   <- optional keywordsP <?> "Keywords parser"
+  sourceM     <- optional sourceP <?> "Source parser"
+  referencesL <- many referenceP <?> "References parser"
+  commentsL   <- many commentP <?> "Comments parser"
 
   pure $ Meta locus' definitionM accessionM versionM keywordsM sourceM referencesL commentsL
 
@@ -48,57 +48,57 @@ locusP = string "LOCUS" *> space *> (Locus
        <$> textP <* space                                      -- name
        <*> decimal <* space <* string "bp" <* space            -- sequence length
        <*> textP <* space                                      -- molecule type
-       <*> wrapMP formP <* space                               -- form of sequence
-       <*> wrapMP (pack <$> many1' (satisfy isUpper)) <* space -- GenBank division
+       <*> optional formP <* space                               -- form of sequence
+       <*> optional (pack <$> some (satisfy isUpper)) <* space   -- GenBank division
        <*> textP                                               -- modification date
        <*  eolSpaceP)
   where
-    textP = takeWhile1 $ not . isSpace
+    textP = takeWhile1P Nothing $ not . isSpace
 
     formP :: Parser Form
-    formP = (string "linear" $> Linear) <|> (string "circular" $> Circular)
+    formP = try (string "linear" $> Linear) <|> (string "circular" $> Circular)
 
 definitionP :: Parser Text
-definitionP =  string "DEFINITION" *> space *> (emptyP <|> someLinesP)
+definitionP =  string "DEFINITION" *> space *> (try emptyP <|> someLinesP)
 
 accessionP :: Parser Text
-accessionP =  string "ACCESSION" *> space *> (emptyP <|> (pack
-          <$> many1' (alphaNumChar <|> char '_')
+accessionP =  string "ACCESSION" *> space *> (try emptyP <|> (pack
+          <$> some (try alphaNumChar <|> char '_')
           <*  eolSpaceP))
 
 versionP :: Parser Version
 versionP =  string "VERSION" *> space
          *> ((Version <$> emptyP <*> pure Nothing) <|> (Version
-        <$> (pack <$> many1' versionP')
-        <*> wrapMP (pack <$> (space *> string "GI:" *> many1' versionP'))
+        <$> (pack <$> some versionP')
+        <*> optional (pack <$> (space *> string "GI:" *> some versionP'))
         <*  eolSpaceP))
   where
-    versionP' = alphaNumChar <|> char '_' <|> char '.'
+    versionP' = try alphaNumChar <|> try (char '_') <|> char '.'
 
 keywordsP :: Parser Text
 keywordsP =  string "KEYWORDS"
-          *> (emptyP
+          *> (try emptyP
          <|> (space *> textWithSpacesP <* eolSpaceP))
 
 sourceP :: Parser Source
 sourceP =  string "SOURCE" *> space
         *> (Source
        <$> someLinesP
-       <*> wrapMP organismP)
+       <*> optional organismP)
   where
     organismP = string "  ORGANISM" *> space *> someLinesP
 
 referenceP :: Parser Reference
 referenceP = string "REFERENCE" *> space
            *> (((\x -> Reference x Nothing Nothing Nothing Nothing) <$> emptyP) <|> (Reference
           <$> someLinesP
-          <*> wrapMP (string "  AUTHORS" *> space *> someLinesP)
-          <*> wrapMP (string "  TITLE" *> space *> someLinesP)
-          <*> wrapMP (string "  JOURNAL" *> space *> someLinesP)
-          <*> wrapMP (string "  PUBMED" *> space *> someLinesP)))
+          <*> optional (string "  AUTHORS" *> space *> someLinesP)
+          <*> optional (string "  TITLE" *> space *> someLinesP)
+          <*> optional (string "  JOURNAL" *> space *> someLinesP)
+          <*> optional (string "  PUBMED" *> space *> someLinesP)))
 
 commentP :: Parser Text
-commentP = string "COMMENT" *> (emptyP <|> (many' (char ' ') *> someLinesP))
+commentP = string "COMMENT" *> (try emptyP <|> (many (char ' ') *> someLinesP))
 
 --------------------------------------------------------------------------------
 -- Block with FEATURES table.
@@ -108,37 +108,63 @@ featuresP :: Parser [(Feature, Range)]
 featuresP = -- skip unknown fields and stop on line with "FEATURES" 
           manyTill (textWithSpacesP <* eolSpaceP) (string "FEATURES") *> space
           *> textWithSpacesP <* eolSpaceP
-          *> many1' (featureP <?> "Single feature parser")
+          *> some (featureP <?> "Single feature parser")
 
 featureP :: Parser (Feature, Range)
 featureP = do
     _ <- string featureIndent1
 
-    featureName'      <- takeWhile (not . isSpace) <* space
-    (strand53, range) <- rangeP <* eolSpaceP
+    featureName'      <- takeWhileP Nothing (not . isSpace) <* space
+    range <- rangeP <* eolSpaceP
 
-    props <- many1' propsP
+    props <- some propsP
 
-    pure (Feature featureName' strand53 props, range)
+    -- | Ranges are 1-based, but the underlying Vector in the Feature is 0-based.
+    -- We shift the range left so the numberings match.
+    --
+    pure (Feature featureName' props, shiftRange (-1) range)
 
-rangeP :: Parser (Bool, Range)
-rangeP =  (string "join" *> fail "Unsupported range with join(..)")
-      <|> (string "complement(" *> rP False <* char ')') 
-      <|> rP True
+rangeP :: Parser Range
+rangeP =  try spanP 
+      <|> try betweenP 
+      <|> try pointP
+      <|> try joinP
+      <|> complementP
   where
-    rP :: Bool -> Parser (Bool, Range)
-    rP b =  fmap (bimap pred id)
-        <$> (,) b
-        <$> (((,) <$> decimal <* string ".." <*> decimal) <|> ((\x -> (x, x)) <$> decimal))
+    spanP :: Parser Range
+    spanP = do
+        lowerBorderType <- option Precise (try $ char '<' *> pure Exceeded)
+        lowerBorderLocation <- decimal
+        _ <- string ".."
+        upperBorderType <- option Precise (try $ char '>' *> pure Exceeded)
+        upperBorderLocation <- decimal
+        pure $ Span (RangeBorder lowerBorderType lowerBorderLocation) (RangeBorder upperBorderType upperBorderLocation) 
+
+    betweenP :: Parser Range
+    betweenP = do
+        before <- decimal
+        _ <- char '^'
+        after <- decimal
+        pure $ Between before after
+
+    pointP :: Parser Range
+    pointP = fmap Point decimal
+
+    joinP :: Parser Range
+    joinP = string "join(" *> fmap Join (rangeP `sepBy1` char ',') <* char ')'
+
+    complementP :: Parser Range
+    complementP = fmap Complement $ string "complement(" *> rangeP <* char ')'
+
 
 propsP :: Parser (Text, Text)
 propsP = do
     _ <- string featureIndent2
     _ <- char '/'
-    propName <- takeWhile1 (/= '=')
+    propName <- takeWhile1P Nothing (/= '=')
     _ <- char '='
 
-    propText <- ((char '\"' *> takeWhile1 (/= '\"') <* char '\"')
+    propText <- try ((char '\"' *> takeWhile1P Nothing (/= '\"') <* char '\"')
              <|> textWithSpacesP)
              <* eolSpaceP
 
@@ -163,8 +189,8 @@ featureIndent2 = pack $ replicate 21 ' '
 originP :: Parser String
 originP =  (string "ORIGIN" <?> "String ORIGIN") *> eolSpaceP
         *> pure toText
-       <*> many1' (space *> many1' digit *> space1
-        *> many1' (many1' letter <* (space1 <|> eolSpaceP)))
+       <*> some (space *> some digitChar *> space1
+        *> some (some letterChar <* (try space1 <|> eolSpaceP)))
   where
     toText :: [[String]] -> String
     toText = concat . fmap concat
@@ -175,6 +201,14 @@ originP =  (string "ORIGIN" <?> "String ORIGIN") *> eolSpaceP
 gbSeqP :: Parser (MarkedSequence Feature Char)
 gbSeqP = do
     features <- (featuresP <?> "Features parser")
+
+    -- | An extract from the GB specification (https://www.ncbi.nlm.nih.gov/genbank/release/current/):
+    --    NOTE: The BASE COUNT linetype is obsolete and was removed
+    --    from the GenBank flatfile format in October 2003.
+    --  Anyway, here, in 2021, we still might get plasmids with the BASE COUNT line present.
+    --
+    _ <- optional $ try (string "BASE COUNT" *> textWithSpacesP *> eol)
+
     origin   <- (originP <?> "Origin parser")
 
     either (fail . unpack) pure (markedSequence origin features)
@@ -189,29 +223,26 @@ firstIndent :: Text
 firstIndent = pack $ replicate 12 ' '
 
 eolSpaceP :: Parser ()
-eolSpaceP = () <$ many' (char ' ') <* endOfLine
+eolSpaceP = () <$ many (char ' ') <* eol
 
 emptyP :: Parser Text
-emptyP = many' (char ' ') *> char '.' *> eolSpaceP *> pure "."
+emptyP = many (char ' ') *> char '.' *> eolSpaceP *> pure "."
 
 textWithSpacesP :: Parser Text
-textWithSpacesP = takeWhile (`notElem` ['\n', '\r'])
+textWithSpacesP = takeWhileP Nothing (`notElem` ['\n', '\r'])
 
 someLinesP :: Parser Text
 someLinesP = intercalate "\n" <$> someLinesIndentP firstIndent
 
 someLinesIndentP :: Text -> Parser [Text]
 someLinesIndentP indent =  (:) <$> textWithSpacesP <* eolSpaceP
-                       <*> (many' (string indent *> textWithSpacesP <* eolSpaceP))
-
-wrapMP :: Parser a -> Parser (Maybe a)
-wrapMP p = fmap Just p <|> pure Nothing
+                       <*> (many (string indent *> textWithSpacesP <* eolSpaceP))
 
 space :: Parser ()
-space = () <$ (many' $ satisfy isSpace)
+space = () <$ (many $ satisfy isSpace)
 
 space1 :: Parser ()
-space1 = () <$ (many1' $ satisfy isSpace)
+space1 = () <$ (some $ satisfy isSpace)
 
 alphaNumChar :: Parser Char
 alphaNumChar = satisfy isAlphaNum