From 9e51618fa1d658695375fbc543d1c6cdaf138950 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Thu, 19 Dec 2024 18:43:24 -0800 Subject: [PATCH 1/2] Add Pod reader Pod ("Plain old documentation") is a markup languaged used principally to document Perl modules and programs. Since it was originally meant to be translated pretty directly to man, the semantics are fairly simple. This Pod reader was developed with reference to the canonical user and implementer documentation of Pod: https://perldoc.perl.org/perlpod and https://perldoc.perl.org/perlpodspec. There are 1490 .pod, .pl, and .pm in the Perl 5.34 distribution found in /System/Library/Perl on my mac. Of those, this reader dies with a parse error on 7 of them. All of them seem to be cases where pod commands are found within a non-colon-prefixed =begin/=end. perlpodspec says I may treat this as an error. [API change] adds readPod --- MANUAL.txt | 2 + pandoc.cabal | 3 + src/Text/Pandoc/Readers.hs | 3 + src/Text/Pandoc/Readers/Pod.hs | 441 +++++++++++++++++++++++++++++++++ test/Tests/Old.hs | 4 + test/Tests/Readers/Pod.hs | 175 +++++++++++++ test/pod-reader.native | 394 +++++++++++++++++++++++++++++ test/pod-reader.pod | 155 ++++++++++++ test/test-pandoc.hs | 2 + 9 files changed, 1179 insertions(+) create mode 100644 src/Text/Pandoc/Readers/Pod.hs create mode 100644 test/Tests/Readers/Pod.hs create mode 100644 test/pod-reader.native create mode 100644 test/pod-reader.pod diff --git a/MANUAL.txt b/MANUAL.txt index 4b24a8673cf5..d2063fa1d177 100644 --- a/MANUAL.txt +++ b/MANUAL.txt @@ -269,6 +269,7 @@ header when requesting a document from a URL: - `odt` ([OpenOffice text document][ODT]) - `opml` ([OPML]) - `org` ([Emacs Org mode]) + - `pod` (Perl's [Plain Old Documentation]) - `ris` ([RIS] bibliography) - `rtf` ([Rich Text Format]) - `rst` ([reStructuredText]) @@ -499,6 +500,7 @@ header when requesting a document from a URL: [OPML]: http://dev.opml.org/spec2.html [OpenDocument]: http://opendocument.xml.org [ODT]: https://en.wikipedia.org/wiki/OpenDocument +[Plain Old Documentation]: https://perldoc.perl.org/perlpod [Textile]: https://textile-lang.com [MediaWiki markup]: https://www.mediawiki.org/wiki/Help:Formatting [DokuWiki markup]: https://www.dokuwiki.org/dokuwiki diff --git a/pandoc.cabal b/pandoc.cabal index 7754b5902f7c..9bd47a9f9930 100644 --- a/pandoc.cabal +++ b/pandoc.cabal @@ -424,6 +424,7 @@ extra-source-files: test/odt/odt/*.odt test/odt/markdown/*.md test/odt/native/*.native + test/pod-reader.pod source-repository head type: git location: git://github.com/jgm/pandoc.git @@ -604,6 +605,7 @@ library Text.Pandoc.Readers.RTF, Text.Pandoc.Readers.Typst, Text.Pandoc.Readers.Djot, + Text.Pandoc.Readers.Pod, Text.Pandoc.Writers, Text.Pandoc.Writers.Native, Text.Pandoc.Writers.DocBook, @@ -837,6 +839,7 @@ test-suite test-pandoc Tests.Readers.Man Tests.Readers.Mdoc Tests.Readers.FB2 + Tests.Readers.Pod Tests.Readers.DokuWiki Tests.Writers.Native Tests.Writers.ConTeXt diff --git a/src/Text/Pandoc/Readers.hs b/src/Text/Pandoc/Readers.hs index 2efd061dffed..8f4800efb1c9 100644 --- a/src/Text/Pandoc/Readers.hs +++ b/src/Text/Pandoc/Readers.hs @@ -64,6 +64,7 @@ module Text.Pandoc.Readers , readRTF , readTypst , readDjot + , readPod -- * Miscellaneous , getReader , getDefaultExtensions @@ -100,6 +101,7 @@ import Text.Pandoc.Readers.Native import Text.Pandoc.Readers.ODT import Text.Pandoc.Readers.OPML import Text.Pandoc.Readers.Org +import Text.Pandoc.Readers.Pod import Text.Pandoc.Readers.RST import Text.Pandoc.Readers.Textile import Text.Pandoc.Readers.TikiWiki @@ -171,6 +173,7 @@ readers = [("native" , TextReader readNative) ,("typst" , TextReader readTypst) ,("djot" , TextReader readDjot) ,("mdoc" , TextReader readMdoc) + ,("pod" , TextReader readPod) ] -- | Retrieve reader, extensions based on format spec (format+extensions). diff --git a/src/Text/Pandoc/Readers/Pod.hs b/src/Text/Pandoc/Readers/Pod.hs new file mode 100644 index 000000000000..544d7f3aeacf --- /dev/null +++ b/src/Text/Pandoc/Readers/Pod.hs @@ -0,0 +1,441 @@ +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE ViewPatterns #-} +{-# LANGUAGE TypeApplications #-} +{- | + Module : Text.Pandoc.Readers.Pod + Copyright : © 2024 Evan Silberman + License : GNU GPL, version 2 or above + + Maintainer : Evan Silberman + Stability : WIP + Portability : portable + +Conversion of Pod to 'Pandoc' documents +-} +module Text.Pandoc.Readers.Pod (readPod) where + +import Control.Monad (void) +import Control.Monad.Except (throwError) +import Data.Char (isAsciiUpper, digitToInt) +import Data.Default (Default) +import Text.Pandoc.Logging +import Text.Pandoc.Options +import Text.Pandoc.Parsing +import Text.Pandoc.Parsing.General (isSpaceChar) +import Text.Pandoc.XML (lookupEntity) +import Text.Pandoc.Class.PandocMonad (PandocMonad(..)) +import Text.Pandoc.Definition (Pandoc) +import Text.Pandoc.Error +import Text.Pandoc.Builder (Blocks, Inlines) +import qualified Text.Pandoc.Builder as B +import qualified Data.Text as T +import qualified Data.Text.Read as TR +import Text.Pandoc.Shared (stringify, textToIdentifier, tshow) +import Data.Set (Set) +import Data.Functor (($>)) +import Data.Maybe (listToMaybe, fromMaybe) +import Numeric (readOct) + +data PodState = PodState + { logMessages :: [LogMessage] + , headerIds :: Set T.Text + , options :: ReaderOptions +} deriving (Show) + +instance HasLogMessages PodState where + addLogMessage msg st = st{ logMessages = msg : logMessages st } + getLogMessages st = reverse $ logMessages st + +instance HasIdentifierList PodState where + extractIdentifierList = headerIds + updateIdentifierList f st = st{headerIds = f (headerIds st)} + +instance HasReaderOptions PodState where + extractReaderOptions = options + +instance Default PodState where + def = PodState + { logMessages = [] + , headerIds = mempty + , options = def + } + +data PodLinkDestination = LinkUrl Inlines T.Text + | LinkMan Inlines (Maybe Inlines) + | LinkInternal Inlines + deriving (Show) + +defaultLinkName :: PodLinkDestination -> Inlines +defaultLinkName (LinkUrl inl _) = inl +defaultLinkName (LinkMan nm (Just sec)) = B.doubleQuoted sec <> " in " <> nm +defaultLinkName (LinkMan nm Nothing) = nm +defaultLinkName (LinkInternal sec) = B.doubleQuoted sec + +type PodParser m = ParsecT Sources PodState m + +readPod :: (PandocMonad m, ToSources a) + => ReaderOptions + -> a + -> m Pandoc +readPod _ s = do + let sources = toSources s + p <- readWithM parsePod def sources + case p of + Right result -> return result + Left e -> throwError e + +parsePod :: PandocMonad m => PodParser m Pandoc +parsePod = do + -- We don't actually start processing Pod until we encounter a Pod command. + -- If we never encounter a Pod command, the document is still valid Pod, it + -- just contains no content. + notPod + bs <- manyTill block eof + reportLogMessages + return $ B.doc $ mconcat bs + +block :: PandocMonad m => PodParser m Blocks +block = verbatim <|> paragraph <|> command "Pod paragraph" + +command :: PandocMonad m => PodParser m Blocks +command = do + try (char '=' >> notFollowedBy (string "item" <|> string "back" <|> string "end")) + header <|> pod <|> cut <|> over <|> for <|> begin <|> encoding "Pod command" + +cmd :: PandocMonad m => T.Text -> PodParser m () +cmd nm = do + textStr nm + notFollowedBy nonspaceChar + void $ many spaceChar + +encoding :: PandocMonad m => PodParser m Blocks +encoding = do + cmd "encoding" + anyLine + logMessage $ IgnoredElement "=encoding; Pandoc requires UTF-8 input" + return mempty + +header :: PandocMonad m => PodParser m Blocks +header = do + string "head" + dig <- oneOf "123456" + void blankline <|> skipMany1 spaceChar + ins <- inlines + attrs <- registerHeader B.nullAttr ins + optional blanklines + return $ B.headerWith attrs (digitToInt dig) ins + +pod :: PandocMonad m => PodParser m Blocks +pod = do + cmd "pod" + optional (try inlines) + optional blanklines + return mempty + +cut :: PandocMonad m => PodParser m Blocks +cut = cmd "cut" *> notPod + +notPod :: PandocMonad m => PodParser m Blocks +notPod = do + manyTill anyLine (eof <|> void (try (lookAhead (char '=' *> letter)))) + return mempty + +over :: PandocMonad m => PodParser m Blocks +over = do + cmd "over" + anyLine + blanklines + optional $ try (char '=' *> cut) + bs <- list <|> blockquote + string "=back" <* blanklines + return bs + +list :: PandocMonad m => PodParser m Blocks +list = try bulletList <|> try orderedList <|> definitionList + +bulletList :: PandocMonad m => PodParser m Blocks +bulletList = B.bulletList <$> many1 (item (many spaceChar *> optional (char '*'))) + +orderedList :: PandocMonad m => PodParser m Blocks +orderedList = do + start <- item1 + more <- many orderedItem + return $ B.orderedList (start : more) + where + item1 = item $ spaces *> char '1' *> optional (char '.') + orderedItem = item $ spaces *> many digit *> optional (char '.') + +item :: PandocMonad m => PodParser m () -> PodParser m Blocks +item p = do + try (cmd "=item") + p + blanklines + mconcat <$> many block "runaway item" + +definitionList :: PandocMonad m => PodParser m Blocks +definitionList = B.definitionList <$> many1 dlItem + where + dlItem = do + try (cmd "=item") + spaces + term <- inlines + blanklines + -- perlpodspec sez the /section part of a link can refer to either + -- a header or a dl item, hence treating it as a "header" here + attrs <- registerHeader B.nullAttr term + defn <- mconcat <$> many block "runaway dlitem" + return (B.spanWith attrs term, [defn]) + +blockquote :: PandocMonad m => PodParser m Blocks +blockquote = B.blockQuote . mconcat <$> many block "runaway blockquote" + +paragraph :: PandocMonad m => PodParser m Blocks +paragraph = do + try (notFollowedBy (char '=' *> letter)) + inl <- inlines + optional blanklines + return $ B.para $ B.trimInlines inl + +inlines :: PandocMonad m => PodParser m Inlines +inlines = mconcat <$> many1 (format <|> whitespace <|> str) + +-- perlpodspec sez: +-- If a Pod processor sees any formatting code other than the ones listed, +-- that processor must by default treat this as an error. +format :: PandocMonad m => PodParser m Inlines +format = try $ do + ctrl <- satisfy isAsciiUpper + p <- getPosition + lookAhead (char '<') + case ctrl of + 'B' -> B.strong <$> argument + 'C' -> B.code . stringify <$> argument + 'F' -> B.spanWith (mempty, ["filename"], mempty) <$> argument + 'I' -> B.emph <$> argument + 'S' -> argument -- TODO map nbsps + 'X' -> argument $> mempty + 'Z' -> argument $> mempty + + 'E' -> do + a <- stringify <$> argument + case entity a of + -- per spec: + -- Pod parsers, when faced with some unknown "E" code, + -- shouldn't simply replace it with nullstring (by default, at + -- least), but may pass it through as a string consisting of the + -- literal characters E, less-than, identifier, greater-than. + Nothing -> do + logMessage $ SkippedContent ("unknown entity " <> a) p + return $ B.str $ "E<" <> a <> ">" + Just e -> return $ B.str e + + 'L' -> link + + x -> throwError $ PandocParseError $ T.snoc "unknown Pod formatting code " x + where + argument = try expandedArg <|> compactArg "argument" + innerStr = B.str <$> many1Char (podCharLess ">") + compactArg = do + char '<' + mconcat <$> manyTill (format <|> whitespace <|> innerStr) (char '>') + expandedArg = do + openLen <- length <$> many1 (char '<') + let close = T.pack $ replicate openLen '>' + skipMany1 spaceChar <|> void blankline + arg <- mconcat <$> many (format <|> try (whitespace <* notFollowedBy (textStr close)) <|> str) + many1 spaceChar + textStr close + return arg + -- Some legacy entity names are required to be parsed by Pod formatters + oct = listToMaybe . readOct @Integer + entity "apos" = Just "'" + entity "sol" = Just "/" + entity "verbar" = Just "|" + entity "lchevron" = Just "«" + entity "rchevron" = Just "»" + entity (T.stripPrefix "0x" -> Just suf) = lookupEntity $ "#x" <> suf + entity (T.stripPrefix "0" -> Just suf) + | Just (n, "") <- oct (T.unpack suf) = lookupEntity $ "#" <> tshow n + entity (TR.decimal @Integer -> Right (x, "")) = lookupEntity $ "#" <> tshow x + entity x = lookupEntity x + +-- god knows there must be a higher order way of writing this thing, where we +-- have multiple different possible parser states within the link argument +-- varying depending on whether the link is expanded or not, but at least I +-- understand what I've done. This would be less wacky with a lexing step. +link :: PandocMonad m => PodParser m Inlines +link = do + identifier <- textToIdentifier <$> getOption readerExtensions + (name, dest) <- try expandedLinkArg <|> compactLinkArg + return $ mkLink identifier name dest + where + compactLinkArg = do + char '<' + name <- linkName whitespace ">" + dest <- linkDest whitespace (char '>') ">" + char '>' + return (mconcat <$> name, dest) + expandedLinkArg = do + openLen <- length <$> many1 (char '<') + let closeStr = textStr (T.pack $ replicate openLen '>') + let close = skipMany1 spaceChar *> closeStr + let sp = try $ many1 spaceChar *> notFollowedBy closeStr $> B.space + many1 spaceChar + name <- linkName sp "" + dest <- linkDest sp close "" + close + return (mconcat <$> name, dest) + mkLink identifier name dest = + let name' = fromMaybe (defaultLinkName dest) name in + case dest of + LinkUrl _ href -> B.link href "" name' + LinkMan nm Nothing -> B.linkWith (mempty, mempty, [("manual", stringify nm)]) "" "" name' + LinkMan nm (Just sc) -> B.linkWith (mempty, mempty, [("manual", stringify nm), ("section", stringify sc)]) "" "" name' + LinkInternal sc -> B.link ("#" <> identifier (stringify sc)) "" name' + + linkName sp ex = optionMaybe $ try $ many + (try format + <|> sp + <|> B.str <$> many1Char (podCharLess ('|':ex))) <* char '|' + linkDest sp close ex = try (url ex) <|> internal sp close ex <|> man sp close ex + -- perlpodspec sez: + -- Note that you can distinguish URL-links from anything else by the + -- fact that they match m/\A\w+:[^:\s]\S*\z/. + -- This is obviously not an RFC-compliant matcher for a URI scheme, but + -- this is what the specification and the canonical implementation (Pod::Simple) + -- do for deciding that a link target "looks like" a URL, as opposed to a + -- manual page reference, so what we are doing here is roughly equivalent + -- even though it is nonsense + url ex = do + scheme <- many1Char (letter <|> digit <|> char '_') + colon <- T.singleton <$> char ':' <* notFollowedBy (char ':') + rst <- many (format <|> B.str <$> many1Char (podCharLess ex)) + return $ LinkUrl + (B.str scheme <> B.str colon <> mconcat rst) + (scheme <> colon <> stringify rst) + quotedSection sp close ex = do + let mystr = B.str <$> many1Char (podCharLess ('\"':ex) <|> try (char '"' <* notFollowedBy close)) + char '"' + ins <- mconcat <$> many1 (format <|> sp <|> mystr) + char '"' + return ins + section sp close ex = try (quotedSection sp close ex) <|> mconcat <$> many1 (format <|> sp <|> B.str <$> many1Char (podCharLess ex)) + internal sp close ex = do + char '/' + LinkInternal <$> section sp close ex + notSlash sp ex = format <|> sp <|> B.str <$> many1Char (podCharLess ('/':ex)) + man sp close ex = do + page <- mconcat <$> many (notSlash sp ex) + sec <- optionMaybe $ char '/' *> section sp close ex + return $ LinkMan page sec + +whitespace :: PandocMonad m => PodParser m Inlines +whitespace = try $ do + many1 spaceChar *> optional newline <|> many spaceChar *> void newline + notFollowedBy blankline + return B.space + +podCharLess :: PandocMonad m => String -> PodParser m Char +podCharLess exclude = try (satisfy isAsciiUpper <* notFollowedBy (char '<')) + <|> satisfy (\c -> not (isSpaceChar c || isAsciiUpper c || elem c exclude)) + +podChar :: PandocMonad m => PodParser m Char +podChar = try (satisfy isAsciiUpper <* notFollowedBy (char '<')) + <|> satisfy (\c -> not (isSpaceChar c || isAsciiUpper c)) + +str :: PandocMonad m => PodParser m Inlines +str = B.str <$> many1Char podChar + +nonEmptyLine :: PandocMonad m => PodParser m T.Text +nonEmptyLine = try $ do + pre <- manyChar spaceChar + something <- T.singleton <$> nonspaceChar + post <- anyLineNewline + return $ pre <> something <> post + +verbatim :: PandocMonad m => PodParser m Blocks +verbatim = do + start <- startVerbatimLine + lns <- many (nonEmptyLine <|> + try (do b <- blanklines + l <- startVerbatimLine + return $ b <> l)) + optional blanklines + return $ B.codeBlock $ mconcat $ start:lns + where + startVerbatimLine = many1Char spaceChar <> nonEmptyLine + +-- =begin/=end/=for and data paragraphs +-- The =begin/=end (and single-paragraph =for variant) markers in Pod are +-- designed as an extension point for specific formatters +-- +-- this doesn't strictly match the intent of "=begin :ident" pod blocks, which +-- are still meant to be processed specially by the formatter, and only land in +-- the output upon request, i.e. pod2html will process "=begin :html" blocks as +-- Pod and include them in the regular output. Since the regions contain Pod +-- markup it seems to me that the best thing to do is parse the markup and put +-- a classname on it, allowing users to respond as desired with filters. +-- Pandoc doesn't have a built-in concept of parsed Divs that are only rendered +-- to certain formats, just raw blocks. +-- +-- perlpodspec allows nesting of =begin/=end regions but we currently don't +-- because it would be annoying and we have something somewhat useful we +-- can do with these blocks which is treat them as RawBlocks, which matches +-- the intent reasonably well, and that gets weirder if we parse a nested +-- structure. It seems unlikely this would be encountered in the wild. + +regionIdentifier :: PandocMonad m => PodParser m T.Text +regionIdentifier = many1Char (alphaNum <|> oneOf "-_") + +for :: PandocMonad m => PodParser m Blocks +for = do + string "for" + many1 spaceChar + forDiv <|> forData + +forDiv :: PandocMonad m => PodParser m Blocks +forDiv = do + char ':' + cls <- regionIdentifier + many1 spaceChar + B.divWith (mempty, [cls], mempty) <$> paragraph + +forData :: PandocMonad m => PodParser m Blocks +forData = do + fmt <- regionIdentifier + ln1 <- anyLineNewline + lns <- many nonEmptyLine + optional blanklines + return $ B.rawBlock fmt (T.concat (ln1 : lns)) + +begin :: PandocMonad m => PodParser m Blocks +begin = do + cmd "begin" + beginDiv <|> beginData + +beginDiv :: PandocMonad m => PodParser m Blocks +beginDiv = do + char ':' + cls <- regionIdentifier + anyLine -- "parameters" may appear in this position + blanklines + bs <- mconcat <$> many block + textStr ("=end :" <> cls) <* blanklines + return $ B.divWith (mempty, [cls], mempty) bs + +beginData :: PandocMonad m => PodParser m Blocks +beginData = do + fmt <- regionIdentifier + anyLine + blanklines + lns <- mconcat <$> many (try rawCut <|> rawLine) + textStr ("=end " <> fmt) <* blanklines + return $ B.rawBlock fmt lns + where + rawCut = do + char '=' *> cut + pod "=pod to close =cut within =begin/=end" + return mempty + rawLine = do + try (notFollowedBy (char '=' *> letter)) + anyLineNewline diff --git a/test/Tests/Old.hs b/test/Tests/Old.hs index bc3a61270dce..f8cc806250cb 100644 --- a/test/Tests/Old.hs +++ b/test/Tests/Old.hs @@ -246,6 +246,10 @@ tests pandocPath = [ test' "ansi" ["-f", "markdown", "-t", "ansi"] "ansi-test.txt" "ansi-test.ansi" ] + , testGroup "pod" + [ test' "pod" ["-f", "pod", "-t", "native"] + "pod-reader.pod" "pod-reader.native" + ] ] where test' = test pandocPath diff --git a/test/Tests/Readers/Pod.hs b/test/Tests/Readers/Pod.hs new file mode 100644 index 000000000000..c812e0754497 --- /dev/null +++ b/test/Tests/Readers/Pod.hs @@ -0,0 +1,175 @@ +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Tests.Readers.Pod + Copyright : © 2024 Evan Silberman + License : GNU GPL, version 2 or above + + Maintainer : + Stability : alpha + Portability : portable + +Tests for the Pod reader. +-} + +module Tests.Readers.Pod (tests) where + +import Data.Text (Text, pack) +import Test.Tasty +import Test.Tasty.HUnit (HasCallStack) +import Tests.Helpers +import Text.Pandoc +import Text.Pandoc.Arbitrary () +import Text.Pandoc.Builder + +pod :: Text -> Pandoc +pod t = (purely $ readPod def) ("=pod\n\n" <> t <> "\n\n=cut\n") + +manLink :: Text -> Maybe Text -> Inlines -> Inlines +manLink nm Nothing = linkWith (mempty, mempty, [("manual", nm)]) "" "" +manLink nm (Just sc) = linkWith (mempty, mempty, [("manual", nm), ("section", sc)]) "" "" + +bogusEntity :: String -> TestTree +bogusEntity t = t =: "E<" <> pack t <> ">" =?> para ("E<" <> str (pack t) <> ">") + +infix 4 =: +(=:) :: (ToString c, HasCallStack) + => String -> (Text, c) -> TestTree +(=:) = test pod + +tests :: [TestTree] +tests = [ + testGroup "inlines" + [ "code with nested inlines" =: + "C (*PRUNE) I/>" =?> + para (code "/A (*PRUNE) B/") + , "compact in compact" =: + "I emphasis>" =?> + para (emph $ (strong "strong") <> " emphasis") + , "expanded in compact" =: + "I> emphasis>" =?> + para (emph $ (strong "strong") <> " emphasis") + , "compact in expanded" =: + "I<<< B emphasis >>>" =?> + para (emph $ (strong "strong") <> " emphasis") + , "expanded in expanded" =: + "I<<< B<<< strong >>> emphasis >>>" =?> + para (emph $ (strong "strong") <> " emphasis") + ] + , testGroup "links" + [ testGroup "compact" + [ "URL" =: + "L" =?> + para (link "https://example.org" "" "https://example.org") + , "URL with link text" =: + "L" =?> + para (link "https://example.org/index.html" "" "link") + , "perl manual" =: + "L" =?> + para (manLink "Foo::Bar" Nothing "Foo::Bar") + , "manual with quoted section" =: + "L" =?> + para (manLink "crontab(5)" (Just "DESCRIPTION") (doubleQuoted "DESCRIPTION" <> " in crontab(5)")) + , "manual with section and formatted link text" =: + "L> link|HTTP::Simple/is_info>" =?> + para (manLink "HTTP::Simple" (Just "is_info") (strong "extravagant" <> " link")) + , "internal link" =: + "L" =?> + para (link "#section-name" "" (doubleQuoted "section name")) + , "internal link with formatting" =: + "L command>" =?> + para (link "#the-pod2html-command" "" (doubleQuoted ("The " <> code "pod2html" <> " command"))) + , "link with angle bracket" =: + "L" =?> + para (manLink "m<" Nothing "m<") + , "empty name" =: + "L<|https://example.org>" =?> + para (link "https://example.org" "" mempty) + ] + , testGroup "expanded" + [ "URL" =: + "L<< https://example.org >>" =?> + para (link "https://example.org" "" "https://example.org") + , "URL with link text" =: + "L<< link|https://example.org/index.html >>" =?> + para (link "https://example.org/index.html" "" "link") + , "perl manual" =: + "L<<< Foo::Bar >>>" =?> + para (manLink "Foo::Bar" Nothing "Foo::Bar") + , "manual with quoted section" =: + "L<< crontab(5)/\"DESCRIPTION\" >>" =?> + para (manLink "crontab(5)" (Just "DESCRIPTION") (doubleQuoted "DESCRIPTION" <> " in crontab(5)")) + , "manual with section and formatted link text" =: + "L<< B<< extravagant >> link|HTTP::Simple/is_info >>" =?> + para (manLink "HTTP::Simple" (Just "is_info") (strong "extravagant" <> " link")) + , "internal link" =: + "L<< /section name >>" =?> + para (link "#section-name" "" (doubleQuoted "section name")) + , "internal link with formatting" =: + "L<<<<< /The C command >>>>>" =?> + para (link "#the-pod2html-command" "" (doubleQuoted ("The " <> code "pod2html" <> " command"))) + , "link with angle bracket" =: + "L<< m< >>" =?> + para (manLink "m<" Nothing "m<") + , "empty name" =: + "L<< |https://example.org >>" =?> + para (link "https://example.org" "" mempty) + ] + ] + , testGroup "entities" + [ testGroup "required" + [ "quot" =: + "E" =?> + para "\"" + , "amp" =: + "E" =?> + para "&" + , "apos" =: + "E" =?> + para "'" + , "lt" =: + "E" =?> + para "<" + , "gt" =: + "E" =?> + para ">" + , "sol" =: + "E" =?> + para "/" + , "verbar" =: + "E" =?> + para "|" + , "lchevron" =: + "E" =?> + para "«" + , "rchevron" =: + "E" =?> + para "»" + ] + , testGroup "numeric" + [ "decimal" =: + "E<162>" =?> + para "¢" + , "octal" =: + "E<0242>" =?> + para "¢" + , "hexadecimal" =: + "E<0xA2>" =?> + para "¢" + , "hexadecimal variant" =: + "E<0x00A2>" =?> + para "¢" + , "actually decimal" =: + "E<099>" =?> + para "c" + ] + , testGroup "bogus" + [ bogusEntity "0XA2" + , bogusEntity "not a real entity" + , bogusEntity "162 1" + , bogusEntity "99 bottles of beer" + , bogusEntity "0xhh" + , bogusEntity "077x" + , bogusEntity "0x63 skidoo" + ] + ] + ] diff --git a/test/pod-reader.native b/test/pod-reader.native new file mode 100644 index 000000000000..07603a80f3e7 --- /dev/null +++ b/test/pod-reader.native @@ -0,0 +1,394 @@ +[ Header + 1 + ( "" , [] , [] ) + [ Str "POD" , Space , Str "TEST" , Space , Str "SUITE" ] +, Para + [ Str "This" + , Space + , Str "is" + , Space + , Str "a" + , Space + , Str "test" + , Space + , Link + ( "" , [] , [] ) + [ Str "Pod" ] + ( "https://perldoc.pl/perlpod" , "" ) + , Space + , Str "document" + , Space + , Str "for" + , Space + , Str "pandoc." + ] +, Para + [ Str "=head2" , Space , Str "Head" , Space , Str "2" ] +, Header + 3 + ( "" , [] , [] ) + [ Str "Head" + , Space + , Str "3:" + , Space + , Emph [ Str "The>" , Space , Str "This is a raw block destined for the HTML format\n\n" +, BulletList + [ [ Para [ Str "Bulleted" , Space , Str "list" ] ] + , [ Para [ Str "Ordered" , Space , Str "list" ] + , OrderedList + ( 1 , DefaultStyle , DefaultDelim ) + [ [ Para + [ Str "Here's" + , Space + , Str "a" + , Space + , Str "verbatim" + , Space + , Str "paragraph" + , Space + , Str "in" + , Space + , Str "this" + , Space + , Str "list" + , Space + , Str "item:" + ] + , CodeBlock + ( "" , [] , [] ) + " this is a code block\nthis is still part of the code block\n so is this.\nIt seems that the prefixed spaces in verbatim blocks in pod don't get stripped.\n\n This should continue the previous code block despite the intervening blank\n line, because the first line starts with a space\n\n\n\n the above blank lines with varying numbers of spaces should also be in\n the code block\n pod formatters should (but not must) expand tabs by default\nso we're not special casing pandoc's behavior there in any way\n" + , Para + [ Str "Wow," + , Space + , Str "that" + , Space + , Str "was" + , Space + , Str "fun." + ] + ] + , [ Para [ Str "Definition" , Space , Str "list" ] + , DefinitionList + [ ( [ Span + ( "" , [] , [] ) + [ Str "Marvin" + , Space + , Str "the" + , Space + , Str "Martian" + ] + ] + , [ [ Para + [ Str "A" + , Space + , Str "cartoon" + , Space + , Str "alien" + ] + ] + ] + ) + , ( [ Span + ( "" , [] , [] ) + [ Emph + [ Str "The" + , Space + , Str "Sun" + , Space + , Str "Also" + , Space + , Str "Rises" + ] + ] + ] + , [ [ Para + [ Str "A" + , Space + , Str "novel" + , Space + , Str "by" + , Space + , Str "Ernest" + , Space + , Str "Hemingway" + ] + ] + ] + ) + , ( [ Span + ( "" , [] , [] ) + [ Code ( "" , [] , [] ) "undefined" ] + ] + , [ [] ] + ) + , ( [ Span + ( "" , [] , [] ) + [ Str "And" + , Space + , Str "now," + , Space + , Str "a" + , Space + , Str "quotation" + ] + ] + , [ [ BlockQuote + [ Para + [ Str "Where's" + , Space + , Str "my" + , Space + , Str "space" + , Space + , Str "modulator?" + ] + ] + ] + ] + ) + ] + ] + , [ Para + [ Str "And" + , Space + , Str "the" + , Space + , Str "list" + , Space + , Str "continues." + ] + ] + ] + ] + , [ Para + [ Str "And" + , Space + , Str "so" + , Space + , Str "does" + , Space + , Str "the" + , Space + , Str "other" + , Space + , Str "one," + , Space + , Str "even" + , Space + , Str "if" + , Space + , Str "I" + , Space + , Str "forget" + , Space + , Str "the" + , Space + , Str "asterisk." + ] + ] + ] +, Div + ( "" , [ "neat" ] , [] ) + [ Para + [ Str "This" + , Space + , Str "is" + , Space + , Str "a" + , Space + , Str "div" + , Space + , Str "for" + , Space + , Str "our" + , Space + , Str "purposes." + ] + , Para + [ Str "It" + , Space + , Str "should" + , Space + , Str "parse" + , Space + , Strong [ Str "content" ] + , Space + , Str "inside" + , Space + , Str "of" + , Space + , Str "it." + ] + , BulletList + [ [ Para [ Str "Like" , Space , Str "this" ] ] ] + ] +, Div + ( "" , [ "excitement" ] , [] ) + [ Para + [ Str "this" + , Space + , Str "is" + , Space + , Str "its" + , Space + , Str "own" + , Space + , Str "div" + ] + ] +, RawBlock + (Format "html") "

and this is its own raw block

\n" +, RawBlock (Format "html") "\n

so is this

\n" +, Header + 2 + ( "" , [] , [] ) + [ Code ( "" , [] , [] ) "=cut" + , Space + , Str "before" + , Space + , Str "any" + , Space + , Code ( "" , [] , [] ) "=item" + , Space + , Str "in" + , Space + , Code ( "" , [] , [] ) "=over" + ] +, BulletList + [ [ Para [ Str "a" ] , Para [ Str "b" ] ] + , [ Para [ Str "c" ] ] + ] +] diff --git a/test/pod-reader.pod b/test/pod-reader.pod new file mode 100644 index 000000000000..49ddd50a95f6 --- /dev/null +++ b/test/pod-reader.pod @@ -0,0 +1,155 @@ +text before any Pod commands is not parsed, which is unusual for formats +supported by pandoc, but + +=head1 POD TEST SUITE + +This is a test L document for pandoc. + +=encoding utf8 + +=head2 Head 2 + +=head3 Head 3: I<< The> > + +=head4 How to use the L command, +an introduction + +=head5 C and his 52 assistants + +=head6 The =head5 and =head6 commands are newer and my syntax highlighting +doesn't recognize them. In any case, it should be possible to have a very long +paragraph in the heading. + +=head6 +It should also be possible to start the heading paragraph on the next line + +=cut + +This doesn't get parsed at all. + +=begin html + +This is a raw block destined for the HTML format + +=end html + +=over + +=item * + +Bulleted list + +=item * + +Ordered list + +=over + +=item 1. + +Here's a verbatim paragraph in this list item: + + this is a code block +this is still part of the code block + so is this. +It seems that the prefixed spaces in verbatim blocks in pod don't get stripped. + + This should continue the previous code block despite the intervening blank + line, because the first line starts with a space + + + + the above blank lines with varying numbers of spaces should also be in + the code block + pod formatters should (but not must) expand tabs by default +so we're not special casing pandoc's behavior there in any way + +Wow, that was fun. + +=item 2. + +Definition list + +=over + +=item Marvin the MZ<>artian + +A cartoon alien + +=item I + +A novel by Ernest Hemingway + +=item C + +=item And now, a quotation + +=over + +Where's my space modulator? + +=back + +=back + +=item 3. + +And the list continues. + +=back + +=item + +And so does the other one, even if I forget the asterisk. + +=back + +=begin :neat + +This is a div for our purposes. + +It should parse B<< content >> inside of it. + +=over + +=item + +Like this + +=back + +=end :neat + +=for :excitement this is its own div + +=for html

and this is its own raw block

+ +=for html +

so is this

+ +=head2 C<=cut> before any C<=item> in C<=over> + +=over + +=cut + +blah + +=item * + +a + +=cut + +blah blah + +=pod + +b + +=item * + +c + +=back diff --git a/test/test-pandoc.hs b/test/test-pandoc.hs index 6d749b84549f..d310b932f316 100644 --- a/test/test-pandoc.hs +++ b/test/test-pandoc.hs @@ -30,6 +30,7 @@ import qualified Tests.Readers.RTF import qualified Tests.Readers.Txt2Tags import qualified Tests.Readers.Man import qualified Tests.Readers.Mdoc +import qualified Tests.Readers.Pod import qualified Tests.Shared import qualified Tests.Writers.AsciiDoc import qualified Tests.Writers.ConTeXt @@ -101,6 +102,7 @@ tests pandocPath = testGroup "pandoc tests" , testGroup "Mdoc" Tests.Readers.Mdoc.tests , testGroup "FB2" Tests.Readers.FB2.tests , testGroup "DokuWiki" Tests.Readers.DokuWiki.tests + , testGroup "Pod" Tests.Readers.Pod.tests ] ] From 2b4fa91d0549118b91cebe337b1a7a5daaa3368a Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Thu, 26 Dec 2024 22:09:12 -0800 Subject: [PATCH 2/2] Make pod the default reader for .pod/.pl/.pm --- src/Text/Pandoc/Format.hs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/Text/Pandoc/Format.hs b/src/Text/Pandoc/Format.hs index 8182002c4bd9..3dfc0ce37e6f 100644 --- a/src/Text/Pandoc/Format.hs +++ b/src/Text/Pandoc/Format.hs @@ -208,6 +208,9 @@ formatFromFilePath x = ".opml" -> defFlavor "opml" ".org" -> defFlavor "org" ".pdf" -> defFlavor "pdf" -- so we get an "unknown reader" error + ".pl" -> defFlavor "pod" + ".pm" -> defFlavor "pod" + ".pod" -> defFlavor "pod" ".pptx" -> defFlavor "pptx" ".ris" -> defFlavor "ris" ".roff" -> defFlavor "ms"