Skip to content

Commit

Permalink
Version 0.1.4.0. Redesigned Range (#61)
Browse files Browse the repository at this point in the history
* Version 0.1.4.0. Redesigned Range

* Range update

* More tests for the Test god, minor fixes

* Added cobot
  • Loading branch information
vks4git authored Oct 7, 2021
1 parent b3c53a7 commit eed0c67
Show file tree
Hide file tree
Showing 20 changed files with 1,289 additions and 369 deletions.
5 changes: 5 additions & 0 deletions ChangeLog.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@

## [Unreleased]

## [0.1.4.0] - 2021-09-27
### Changed
- Redesigned the Range type to reflect all possible cases.
- Switched to Megaparsec.

## [0.1.3.25] - 2021-09-15
### Added
- Pedantic build and CI
Expand Down
23 changes: 13 additions & 10 deletions package.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name: cobot-io
version: 0.1.3.25
version: 0.1.4.0
github: "biocad/cobot-io"
license: BSD3
category: Bio
Expand All @@ -20,21 +20,24 @@ extra-source-files:
description: Please see the README on GitHub at <https://github.com/biocad/cobot-io#readme>

dependencies:
- base >= 4.7 && < 5
- data-msgpack >= 0.0.9 && < 0.1
- text >= 1.2.2.1 && < 1.3
- bytestring >= 0.10.8.1 && < 0.11
- split
- base >= 4.14 && < 5
- array >= 0.5 && < 0.6
- attoparsec >= 0.10 && < 0.15
- binary >= 0.8.3.0 && < 1.0
- mtl >= 2.2.1 && < 2.3.0
- hyraxAbif >= 0.2.3.27 && < 0.2.4.0
- bytestring >= 0.10.8.1 && < 0.11
- cobot >= 0.1.1.7
- containers >= 0.5.7.1 && < 0.7
- http-conduit >= 2.3 && < 2.4
- array >= 0.5 && < 0.6
- data-msgpack >= 0.0.9 && < 0.1
- deepseq >= 1.4 && < 1.5
- http-conduit >= 2.3 && < 2.4
- hyraxAbif >= 0.2.3.27 && < 0.2.4.0
- lens >= 4.16 && < 5.1
- linear >= 1.20 && < 1.22
- megaparsec >= 9.0.1
- mtl >= 2.2.1 && < 2.3.0
- parser-combinators >= 1.2.1
- split
- text >= 1.2.2.1 && < 1.3
- vector

library:
Expand Down
12 changes: 3 additions & 9 deletions src/Bio/GB.hs
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
{-# LANGUAGE CPP #-}

module Bio.GB
( module T
, fromFile
Expand All @@ -13,19 +11,15 @@ import Bio.GB.Parser
import Bio.GB.Type as T
import Bio.GB.Writer (genBankToText)
import Control.Monad.IO.Class (MonadIO, liftIO)
import Data.Attoparsec.Text (parseOnly)
import Data.Bifunctor (first)
import Data.Text (Text, pack)
import qualified Data.Text.IO as TIO (readFile, writeFile)
#if !MIN_VERSION_base(4,13,0)
import Control.Monad.Fail (MonadFail(..))
import Prelude hiding (fail)
#endif
import Text.Megaparsec (eof, errorBundlePretty, parse)

-- | Reads 'GenBankSequence' from givem file.
--
fromFile :: (MonadFail m, MonadIO m) => FilePath -> m GenBankSequence
fromFile f = liftIO (TIO.readFile f) >>= either fail pure . parseOnly genBankP
fromFile f = liftIO (TIO.readFile f) >>= either (fail . errorBundlePretty) pure . parse (genBankP <* eof) ""

-- | Writes 'GenBankSequence' to file.
--
Expand All @@ -35,7 +29,7 @@ toFile s f = liftIO $ TIO.writeFile f $ genBankToText s
-- | Reads 'GenBankSequence' from 'Text'.
--
fromText :: Text -> Either Text GenBankSequence
fromText = first pack . parseOnly genBankP
fromText = first (pack . errorBundlePretty) . parse (genBankP <* eof) ""

-- | Writes 'GenBankSequence' to 'Text'.
--
Expand Down
149 changes: 90 additions & 59 deletions src/Bio/GB/Parser.hs
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,28 @@

module Bio.GB.Parser
( genBankP
, rangeP
) where

import Bio.GB.Type (Feature (..), Form (..), GenBankSequence (..), Locus (..),
Meta (..), Reference (..), Source (..), Version (..))
import Bio.Sequence (MarkedSequence, Range, markedSequence)
import Control.Applicative ((<|>))
import Data.Attoparsec.Combinator (manyTill)
import Data.Attoparsec.Text (Parser, char, decimal, digit, endOfInput, endOfLine, letter,
many', many1', satisfy, string, takeWhile, takeWhile1, (<?>))
import Data.Bifunctor (bimap)
Meta (..), Parser, Reference (..), Source (..), Version (..))
import Bio.Sequence (Border (..), MarkedSequence, Range (..), RangeBorder (..),
markedSequence, shiftRange)
import Control.Monad.Combinators (many, manyTill, optional, some, (<|>))
import Data.Char (isAlphaNum, isSpace, isUpper)
import Data.Functor (($>))
import Data.Text (Text, intercalate, pack, splitOn, unpack)
import Prelude hiding (takeWhile)
import Text.Megaparsec (option, satisfy, sepBy1, takeWhile1P, takeWhileP, try, (<?>))
import Text.Megaparsec.Char (char, digitChar, eol, letterChar, string)
import Text.Megaparsec.Char.Lexer (decimal)

-- | Parser of .gb file.
--
genBankP :: Parser GenBankSequence
genBankP = GenBankSequence
<$> (metaP <?> "Meta parser")
<*> (gbSeqP <?> "GB sequence parser")
<* string "//" <* eolSpaceP <* endOfInput
<* string "//" <* eolSpaceP

--------------------------------------------------------------------------------
-- Block with meta-information.
Expand All @@ -33,13 +33,13 @@ metaP :: Parser Meta
metaP = do
locus' <- locusP <?> "Locus parser"

definitionM <- wrapMP definitionP <?> "Definition parser"
accessionM <- wrapMP accessionP <?> "Accession parser"
versionM <- wrapMP versionP <?> "Version parser"
keywordsM <- wrapMP keywordsP <?> "Keywords parser"
sourceM <- wrapMP sourceP <?> "Source parser"
referencesL <- many' referenceP <?> "References parser"
commentsL <- many' commentP <?> "Comments parser"
definitionM <- optional definitionP <?> "Definition parser"
accessionM <- optional accessionP <?> "Accession parser"
versionM <- optional versionP <?> "Version parser"
keywordsM <- optional keywordsP <?> "Keywords parser"
sourceM <- optional sourceP <?> "Source parser"
referencesL <- many referenceP <?> "References parser"
commentsL <- many commentP <?> "Comments parser"

pure $ Meta locus' definitionM accessionM versionM keywordsM sourceM referencesL commentsL

Expand All @@ -48,57 +48,57 @@ locusP = string "LOCUS" *> space *> (Locus
<$> textP <* space -- name
<*> decimal <* space <* string "bp" <* space -- sequence length
<*> textP <* space -- molecule type
<*> wrapMP formP <* space -- form of sequence
<*> wrapMP (pack <$> many1' (satisfy isUpper)) <* space -- GenBank division
<*> optional formP <* space -- form of sequence
<*> optional (pack <$> some (satisfy isUpper)) <* space -- GenBank division
<*> textP -- modification date
<* eolSpaceP)
where
textP = takeWhile1 $ not . isSpace
textP = takeWhile1P Nothing $ not . isSpace

formP :: Parser Form
formP = (string "linear" $> Linear) <|> (string "circular" $> Circular)
formP = try (string "linear" $> Linear) <|> (string "circular" $> Circular)

definitionP :: Parser Text
definitionP = string "DEFINITION" *> space *> (emptyP <|> someLinesP)
definitionP = string "DEFINITION" *> space *> (try emptyP <|> someLinesP)

accessionP :: Parser Text
accessionP = string "ACCESSION" *> space *> (emptyP <|> (pack
<$> many1' (alphaNumChar <|> char '_')
accessionP = string "ACCESSION" *> space *> (try emptyP <|> (pack
<$> some (try alphaNumChar <|> char '_')
<* eolSpaceP))

versionP :: Parser Version
versionP = string "VERSION" *> space
*> ((Version <$> emptyP <*> pure Nothing) <|> (Version
<$> (pack <$> many1' versionP')
<*> wrapMP (pack <$> (space *> string "GI:" *> many1' versionP'))
<$> (pack <$> some versionP')
<*> optional (pack <$> (space *> string "GI:" *> some versionP'))
<* eolSpaceP))
where
versionP' = alphaNumChar <|> char '_' <|> char '.'
versionP' = try alphaNumChar <|> try (char '_') <|> char '.'

keywordsP :: Parser Text
keywordsP = string "KEYWORDS"
*> (emptyP
*> (try emptyP
<|> (space *> textWithSpacesP <* eolSpaceP))

sourceP :: Parser Source
sourceP = string "SOURCE" *> space
*> (Source
<$> someLinesP
<*> wrapMP organismP)
<*> optional organismP)
where
organismP = string " ORGANISM" *> space *> someLinesP

referenceP :: Parser Reference
referenceP = string "REFERENCE" *> space
*> (((\x -> Reference x Nothing Nothing Nothing Nothing) <$> emptyP) <|> (Reference
<$> someLinesP
<*> wrapMP (string " AUTHORS" *> space *> someLinesP)
<*> wrapMP (string " TITLE" *> space *> someLinesP)
<*> wrapMP (string " JOURNAL" *> space *> someLinesP)
<*> wrapMP (string " PUBMED" *> space *> someLinesP)))
<*> optional (string " AUTHORS" *> space *> someLinesP)
<*> optional (string " TITLE" *> space *> someLinesP)
<*> optional (string " JOURNAL" *> space *> someLinesP)
<*> optional (string " PUBMED" *> space *> someLinesP)))

commentP :: Parser Text
commentP = string "COMMENT" *> (emptyP <|> (many' (char ' ') *> someLinesP))
commentP = string "COMMENT" *> (try emptyP <|> (many (char ' ') *> someLinesP))

--------------------------------------------------------------------------------
-- Block with FEATURES table.
Expand All @@ -108,37 +108,63 @@ featuresP :: Parser [(Feature, Range)]
featuresP = -- skip unknown fields and stop on line with "FEATURES"
manyTill (textWithSpacesP <* eolSpaceP) (string "FEATURES") *> space
*> textWithSpacesP <* eolSpaceP
*> many1' (featureP <?> "Single feature parser")
*> some (featureP <?> "Single feature parser")

featureP :: Parser (Feature, Range)
featureP = do
_ <- string featureIndent1

featureName' <- takeWhile (not . isSpace) <* space
(strand53, range) <- rangeP <* eolSpaceP
featureName' <- takeWhileP Nothing (not . isSpace) <* space
range <- rangeP <* eolSpaceP

props <- many1' propsP
props <- some propsP

pure (Feature featureName' strand53 props, range)
-- | Ranges are 1-based, but the underlying Vector in the Feature is 0-based.
-- We shift the range left so the numberings match.
--
pure (Feature featureName' props, shiftRange (-1) range)

rangeP :: Parser (Bool, Range)
rangeP = (string "join" *> fail "Unsupported range with join(..)")
<|> (string "complement(" *> rP False <* char ')')
<|> rP True
rangeP :: Parser Range
rangeP = try spanP
<|> try betweenP
<|> try pointP
<|> try joinP
<|> complementP
where
rP :: Bool -> Parser (Bool, Range)
rP b = fmap (bimap pred id)
<$> (,) b
<$> (((,) <$> decimal <* string ".." <*> decimal) <|> ((\x -> (x, x)) <$> decimal))
spanP :: Parser Range
spanP = do
lowerBorderType <- option Precise (try $ char '<' *> pure Exceeded)
lowerBorderLocation <- decimal
_ <- string ".."
upperBorderType <- option Precise (try $ char '>' *> pure Exceeded)
upperBorderLocation <- decimal
pure $ Span (RangeBorder lowerBorderType lowerBorderLocation) (RangeBorder upperBorderType upperBorderLocation)

betweenP :: Parser Range
betweenP = do
before <- decimal
_ <- char '^'
after <- decimal
pure $ Between before after

pointP :: Parser Range
pointP = fmap Point decimal

joinP :: Parser Range
joinP = string "join(" *> fmap Join (rangeP `sepBy1` char ',') <* char ')'

complementP :: Parser Range
complementP = fmap Complement $ string "complement(" *> rangeP <* char ')'


propsP :: Parser (Text, Text)
propsP = do
_ <- string featureIndent2
_ <- char '/'
propName <- takeWhile1 (/= '=')
propName <- takeWhile1P Nothing (/= '=')
_ <- char '='

propText <- ((char '\"' *> takeWhile1 (/= '\"') <* char '\"')
propText <- try ((char '\"' *> takeWhile1P Nothing (/= '\"') <* char '\"')
<|> textWithSpacesP)
<* eolSpaceP

Expand All @@ -163,8 +189,8 @@ featureIndent2 = pack $ replicate 21 ' '
originP :: Parser String
originP = (string "ORIGIN" <?> "String ORIGIN") *> eolSpaceP
*> pure toText
<*> many1' (space *> many1' digit *> space1
*> many1' (many1' letter <* (space1 <|> eolSpaceP)))
<*> some (space *> some digitChar *> space1
*> some (some letterChar <* (try space1 <|> eolSpaceP)))
where
toText :: [[String]] -> String
toText = concat . fmap concat
Expand All @@ -175,6 +201,14 @@ originP = (string "ORIGIN" <?> "String ORIGIN") *> eolSpaceP
gbSeqP :: Parser (MarkedSequence Feature Char)
gbSeqP = do
features <- (featuresP <?> "Features parser")

-- | An extract from the GB specification (https://www.ncbi.nlm.nih.gov/genbank/release/current/):
-- NOTE: The BASE COUNT linetype is obsolete and was removed
-- from the GenBank flatfile format in October 2003.
-- Anyway, here, in 2021, we still might get plasmids with the BASE COUNT line present.
--
_ <- optional $ try (string "BASE COUNT" *> textWithSpacesP *> eol)

origin <- (originP <?> "Origin parser")

either (fail . unpack) pure (markedSequence origin features)
Expand All @@ -189,29 +223,26 @@ firstIndent :: Text
firstIndent = pack $ replicate 12 ' '

eolSpaceP :: Parser ()
eolSpaceP = () <$ many' (char ' ') <* endOfLine
eolSpaceP = () <$ many (char ' ') <* eol

emptyP :: Parser Text
emptyP = many' (char ' ') *> char '.' *> eolSpaceP *> pure "."
emptyP = many (char ' ') *> char '.' *> eolSpaceP *> pure "."

textWithSpacesP :: Parser Text
textWithSpacesP = takeWhile (`notElem` ['\n', '\r'])
textWithSpacesP = takeWhileP Nothing (`notElem` ['\n', '\r'])

someLinesP :: Parser Text
someLinesP = intercalate "\n" <$> someLinesIndentP firstIndent

someLinesIndentP :: Text -> Parser [Text]
someLinesIndentP indent = (:) <$> textWithSpacesP <* eolSpaceP
<*> (many' (string indent *> textWithSpacesP <* eolSpaceP))

wrapMP :: Parser a -> Parser (Maybe a)
wrapMP p = fmap Just p <|> pure Nothing
<*> (many (string indent *> textWithSpacesP <* eolSpaceP))

space :: Parser ()
space = () <$ (many' $ satisfy isSpace)
space = () <$ (many $ satisfy isSpace)

space1 :: Parser ()
space1 = () <$ (many1' $ satisfy isSpace)
space1 = () <$ (some $ satisfy isSpace)

alphaNumChar :: Parser Char
alphaNumChar = satisfy isAlphaNum
Loading

0 comments on commit eed0c67

Please sign in to comment.