From 3da4d41c071ec81ecaf4dff73b70c56fad56cc96 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Wed, 18 Dec 2024 14:51:25 -0800 Subject: [PATCH] Markdown reader: more efficient base64 data URI parsing. This patch borrows some code from @silby's PR #10434 and should be regarded as co-authored. This is a lighter-weight patch that only touches the Markdown reader. The basic idea is to speed up parsing of base64 URIs by parsing them with a special path. This should improve the problem noted at #10075. Benchmarks (optimized compilation): Converting the large test.md from #10075 (7.6Mb embedded image) from markdown to json, before: 6182 GCs, 1578M in use, 5.471 MUT, 1.473 GC after: 951 GCs, 80M in use, .247 MUT, 0.035 GC For now we leave #10075 open to investigate improvements in HTML rendering with these large data URIs. Co-authored-by: Evan Silberman --- src/Text/Pandoc/Readers/Markdown.hs | 37 ++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index 51782f00015f..b513fdc985ca 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -28,6 +28,7 @@ import Data.List (transpose, elemIndex, sortOn, foldl') import qualified Data.Map as M import Data.Maybe import qualified Data.Set as Set +import qualified Data.Attoparsec.Text as A import Data.Text (Text) import qualified Data.Text as T import qualified Data.ByteString as BS @@ -1834,12 +1835,46 @@ source = do let sourceURL = T.unwords . T.words . T.concat <$> many urlChunk let betweenAngles = try $ char '<' >> mconcat <$> (manyTill litChar (char '>')) - src <- try betweenAngles <|> sourceURL + src <- try betweenAngles <|> try pBase64DataURI <|> sourceURL tit <- option "" $ try $ spnl >> linkTitle skipSpaces char ')' return (escapeURI $ trimr src, tit) +pBase64DataURI :: PandocMonad m => ParsecT Sources s m Text +pBase64DataURI = mconcat <$> sequence + [ textStr "data:" + , T.singleton <$> alphaNum + , restrictedName + , T.singleton <$> char '/' + , restrictedName + , textStr ";" + , mconcat <$> many (try mediaParam) + , textStr "base64," + , pBase64Data + ] + where + restrictedName = manyChar (satisfy (A.inClass "A-Za-z0-9!#$&^_.+-")) + mediaParam = mconcat <$> sequence + [ restrictedName + , textStr "=" + , manyChar (noneOf ";") + , textStr ";" + ] + +pBase64Data :: PandocMonad m => ParsecT Sources s m Text +pBase64Data = do + Sources inps <- getInput + case inps of + [] -> mzero + (fp,t):rest -> do + satisfy (A.inClass "A-Za-z0-9+/") -- parse one character or parsec won't know + -- we have consumed input + let (a,r) = T.span (A.inClass "A-Za-z0-9+/") t + let (b, trest) = T.span (=='=') r + setInput $ Sources ((fp,trest):rest) + return (a <> b) + linkTitle :: PandocMonad m => MarkdownParser m Text linkTitle = quotedTitle '"' <|> quotedTitle '\''