Skip to content

Commit

Permalink
Further improvements to base64 data URI parsing.
Browse files Browse the repository at this point in the history
Text.Pandoc.URI: export `pBase64DataURI`.  Modify `isURI` to use this
and avoid calling network-uri's inefficient `parseURI` for data URIs.

Markdown reader: use T.P.URI's `pBase64DataURI` in parsing data
URIs.

Partially addresses #10075.

Obsoletes #10434 (borrowing most of its ideas).

Co-authored-by: Evan Silberman <[email protected]>
  • Loading branch information
jgm and silby committed Dec 19, 2024
1 parent 5ae45a0 commit b27a8cc
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 42 deletions.
49 changes: 12 additions & 37 deletions src/Text/Pandoc/Readers/Markdown.hs
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ import Text.Pandoc.Readers.HTML (htmlInBalanced, htmlTag, isBlockTag,
isCommentTag, isInlineTag, isTextTag)
import Text.Pandoc.Readers.LaTeX (applyMacros, rawLaTeXBlock, rawLaTeXInline)
import Text.Pandoc.Shared
import Text.Pandoc.URI (escapeURI, isURI)
import Text.Pandoc.URI (escapeURI, isURI, pBase64DataURI)
import Text.Pandoc.XML (fromEntities)
import Text.Pandoc.Readers.Metadata (yamlBsToMeta, yamlBsToRefs, yamlMetaBlock)
-- import Debug.Trace (traceShowId)
Expand Down Expand Up @@ -1835,47 +1835,22 @@ source = do
let sourceURL = T.unwords . T.words . T.concat <$> many urlChunk
let betweenAngles = try $
char '<' >> mconcat <$> (manyTill litChar (char '>'))
src <- try betweenAngles <|> try pBase64DataURI <|> sourceURL
src <- try betweenAngles <|> try base64DataURI <|> sourceURL
tit <- option "" $ try $ spnl >> linkTitle
skipSpaces
char ')'
return (escapeURI $ trimr src, tit)

pBase64DataURI :: PandocMonad m => ParsecT Sources s m Text
pBase64DataURI = mconcat <$> sequence
[ textStr "data:"
, T.singleton <$> alphaNum
, restrictedName
, T.singleton <$> char '/'
, restrictedName
, textStr ";"
, mconcat <$> many (try mediaParam)
, textStr "base64,"
, pBase64Data
]
where
restrictedName = manyChar (satisfy (A.inClass "A-Za-z0-9!#$&^_.+-"))
mediaParam = mconcat <$> sequence
[ restrictedName
, textStr "="
, manyChar (noneOf ";")
, textStr ";"
]

pBase64Data :: PandocMonad m => ParsecT Sources s m Text
pBase64Data = do
Sources inps <- getInput
case inps of
[] -> mzero
(pos,t):rest -> do
satisfy (A.inClass "A-Za-z0-9+/") -- parse one character or parsec won't know
-- we have consumed input
let (a,r) = T.span (A.inClass "A-Za-z0-9+/") t
let (b, trest) = T.span (=='=') r
let b64 = a <> b
let pos' = incSourceColumn pos (T.length b64)
setInput $ Sources ((pos',trest):rest)
return b64
base64DataURI :: PandocMonad m => ParsecT Sources s m Text
base64DataURI = do
Sources ((pos, txt):rest) <- getInput
let r = A.parse pBase64DataURI txt
case r of
A.Done remaining consumed -> do
let pos' = incSourceColumn pos (T.length consumed)
setInput $ Sources ((pos', remaining):rest)
return consumed
_ -> mzero

linkTitle :: PandocMonad m => MarkdownParser m Text
linkTitle = quotedTitle '"' <|> quotedTitle '\''
Expand Down
39 changes: 34 additions & 5 deletions src/Text/Pandoc/URI.hs
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,16 @@ module Text.Pandoc.URI ( urlEncode
, isURI
, schemes
, uriPathToPath
, pBase64DataURI
) where
import qualified Network.HTTP.Types as HTTP
import qualified Text.Pandoc.UTF8 as UTF8
import qualified Data.Text as T
import qualified Data.Set as Set
import Data.Char (isSpace, isAscii)
import Network.URI (URI (uriScheme), parseURI, escapeURIString)
import qualified Data.Attoparsec.Text as A
import Control.Applicative (many)

urlEncode :: T.Text -> T.Text
urlEncode = UTF8.toText . HTTP.urlEncode True . UTF8.fromText
Expand Down Expand Up @@ -90,12 +93,16 @@ schemes = Set.fromList
-- | Check if the string is a valid URL with a IANA or frequently used but
-- unofficial scheme (see @schemes@).
isURI :: T.Text -> Bool
isURI =
-- we URI-escape non-ASCII characters because otherwise parseURI will choke:
maybe False hasKnownScheme . parseURI . escapeURIString isAscii . T.unpack
isURI t =
-- If it's a base 64 data: URI, avoid the expensive call to parseURI:
case A.parseOnly (pBase64DataURI *> A.endOfInput) t of
Right () -> True
Left _ ->
-- we URI-escape non-ASCII characters because otherwise parseURI will choke:
maybe False hasKnownScheme . parseURI . escapeURIString isAscii . T.unpack $ t
where
hasKnownScheme = (`Set.member` schemes) . T.toLower .
T.filter (/= ':') . T.pack . uriScheme
hasKnownScheme =
(`Set.member` schemes) . T.toLower . T.filter (/= ':') . T.pack . uriScheme

-- | Converts the path part of a file: URI to a regular path.
-- On windows, @/c:/foo@ should be @c:/foo@.
Expand All @@ -109,3 +116,25 @@ uriPathToPath (T.unpack -> path) =
#else
path
#endif

pBase64DataURI :: A.Parser T.Text
pBase64DataURI = fst <$> A.match base64uri
where
base64uri = do
A.string "data:"
restrictedName
A.char '/'
restrictedName
A.char ';'
many mediaParam
A.string "base64,"
A.skipWhile (A.inClass "A-Za-z0-9+/")
A.skipWhile (== '=')
restrictedName = do
A.satisfy (A.inClass "A-Za-z0-9")
A.skipWhile (A.inClass "A-Za-z0-9!#$&^_.+-")
mediaParam = do
restrictedName
A.char '='
A.skipWhile (/=';')
A.char ';'

0 comments on commit b27a8cc

Please sign in to comment.