From f5513f6d5c24e801617dc5db535fd468b1a825ad Mon Sep 17 00:00:00 2001
From: Thibaut Lienart
Date: Tue, 10 Sep 2019 12:56:57 +0200
Subject: [PATCH] Fixes to markdown parser following trials on Julia Blog posts
(#218)
* small fixes while playing with jekyll migration
* fixing issue with indented code blocks separated by empty lines
* fixing a glitch with double backticks
---
src/converter/fixer.jl | 2 +-
src/converter/md.jl | 6 ++--
src/converter/md_blocks.jl | 6 ++--
src/converter/md_utils.jl | 6 ++--
src/misc_html.jl | 9 +++++-
src/parser/ocblocks.jl | 59 +++++++++++++++++++++++++++++++++++--
test/converter/markdown3.jl | 49 +++++++++++++++++++++++++++++-
test/runtests.jl | 1 +
test/test_utils.jl | 1 +
9 files changed, 124 insertions(+), 15 deletions(-)
diff --git a/src/converter/fixer.jl b/src/converter/fixer.jl
index 06b299ed9..bb61c727d 100644
--- a/src/converter/fixer.jl
+++ b/src/converter/fixer.jl
@@ -14,7 +14,7 @@ function find_and_fix_md_links(hs::String)::String
# the regexes very readable...
# here we're looking for [id]: link; 1=id 2=link
- m_link_defs = collect(eachmatch(r"[((?:(?!]).)*?)]:\s((?:(?!\<\/p\>)\S)+)", hs))
+ m_link_defs = collect(eachmatch(r"[((?:(?!]).)*?)]:\s+((?:(?!\<\/p\>)\S)+)", hs))
def_names = [def.captures[1] for def in m_link_defs]
def_links = [def.captures[2] for def in m_link_defs]
diff --git a/src/converter/md.jl b/src/converter/md.jl
index 066c7819a..3e07b3b5d 100644
--- a/src/converter/md.jl
+++ b/src/converter/md.jl
@@ -39,9 +39,11 @@ function convert_md(mds::String, pre_lxdefs::Vector{LxDef}=Vector{LxDef}();
#> 2. Open-Close blocks (OCBlocks)
#>> a. find them
blocks, tokens = find_all_ocblocks(tokens, MD_OCB_ALL)
- #>> b. now that blocks have been found, line-returns can be dropped
+ #>> b. merge CODE_BLOCK_IND blocks which are separated only by empty lines
+ merge_indented_code_blocks!(blocks, mds)
+ #>> c. now that blocks have been found, line-returns can be dropped
filter!(τ -> τ.name ∉ L_RETURNS, tokens)
- #>> c. filter out "fake headers" (opening ### that are not at the start of a line)
+ #>> d. filter out "fake headers" (opening ### that are not at the start of a line)
filter!(β -> validate_header_block(β), blocks)
#> 3. LaTeX commands
diff --git a/src/converter/md_blocks.jl b/src/converter/md_blocks.jl
index ebd7a7c2b..2e8aeeb9d 100644
--- a/src/converter/md_blocks.jl
+++ b/src/converter/md_blocks.jl
@@ -11,10 +11,10 @@ function convert_block(β::AbstractBlock, lxcontext::LxContext)::AbstractString
# Return relevant interpolated string based on case
βn = β.name
βn ∈ MD_HEADER && return convert_header(β)
- βn == :CODE_INLINE && return md2html(β.ss; stripp=true, code=true)
+ βn == :CODE_INLINE && return html_code_inline(content(β) |> Markdown.htmlesc)
βn == :CODE_BLOCK_LANG && return convert_code_block(β.ss)
βn == :CODE_BLOCK_IND && return convert_indented_code_block(β.ss)
- βn == :CODE_BLOCK && return md2html(β.ss; code=true)
+ βn == :CODE_BLOCK && return md2html(β.ss)
βn == :ESCAPE && return chop(β.ss, head=3, tail=3)
# Math block --> needs to call further processing to resolve possible latex
@@ -185,5 +185,5 @@ function convert_indented_code_block(ss::SubString)::String
# 1. decrease indentation of all lines (either frontal \n\t or \n⎵⎵⎵⎵)
code = replace(ss, r"\n(?:\t| {4})" => "\n")
# 2. return; lang is a LOCAL_PAGE_VARS that is julia by default and can be set
- return html_code(code, "{{fill lang}}")
+ return html_code(strip(code), "{{fill lang}}")
end
diff --git a/src/converter/md_utils.jl b/src/converter/md_utils.jl
index 808eac407..4edabf8d6 100644
--- a/src/converter/md_utils.jl
+++ b/src/converter/md_utils.jl
@@ -6,13 +6,11 @@ that don't need to be further considered and don't contain anything else than ma
The boolean `stripp` indicates whether to remove the inserted `<p>` and `</p>` by the base markdown
processor, this is relevant for things that are parsed within latex commands etc.
"""
-function md2html(ss::AbstractString; stripp::Bool=false, code::Bool=false)::AbstractString
-
+function md2html(ss::AbstractString; stripp::Bool=false)::AbstractString
+ # if there's nothing, return that...
isempty(ss) && return ss
-
# Use Julia's Markdown parser followed by Julia's MD->HTML conversion
partial = ss |> fix_inserts |> Markdown.parse |> Markdown.html
-
# In some cases, base converter adds <p>...</p>\n which we might not want
stripp || return partial
startswith(partial, "") && (partial = chop(partial, head=3))
diff --git a/src/misc_html.jl b/src/misc_html.jl
index 8b53a70ac..a8928a7af 100644
--- a/src/misc_html.jl
+++ b/src/misc_html.jl
@@ -32,7 +32,7 @@ html_img(src::AbstractString, alt::AbstractString="") =
"""
$(SIGNATURES)
-Convenience function to introduce an image.
+Convenience function to introduce a code block.
"""
function html_code(c::AbstractString, lang::AbstractString="")
isempty(c) && return ""
@@ -43,6 +43,13 @@ end
"""
$(SIGNATURES)
+Convenience function to introduce inline code: wrap `c` in an HTML `<code>` element. The string
+`c` is inserted verbatim, so the caller is expected to have HTML-escaped it beforehand.
+"""
+html_code_inline(c::AbstractString) = "<code>$c</code>"
+
+"""
+$(SIGNATURES)
+
Insertion of a visible red message in HTML to show there was a problem.
"""
html_err(mess::String="") = "
// $mess //
"
diff --git a/src/parser/ocblocks.jl b/src/parser/ocblocks.jl
index be2131825..a24744836 100644
--- a/src/parser/ocblocks.jl
+++ b/src/parser/ocblocks.jl
@@ -5,7 +5,8 @@ Find active blocks between an opening token (`otoken`) and a closing token `ctok
nested (e.g. braces). Return the list of such blocks. If `deactivate` is `true`, all the tokens
within the block will be marked as inactive (for further, separate processing).
"""
-function find_ocblocks(tokens::Vector{Token}, ocproto::OCProto; inmath=false)
+function find_ocblocks(tokens::Vector{Token}, ocproto::OCProto;
+ inmath=false)::Tuple{Vector{OCBlock}, Vector{Token}}
ntokens = length(tokens)
active_tokens = ones(Bool, length(tokens))
@@ -127,8 +128,8 @@ function find_indented_blocks(tokens::Vector{Token}, st::String)::Vector{Token}
# blocks.
for i in 1:length(lr_idx)-1
# capture start and finish of the line (from line return to line return)
- start = from(tokens[lr_idx[i]]) # first :LINE_RETURN
- finish = from(tokens[lr_idx[i+1]]) # next :LINE_RETURN
+ start = from(tokens[lr_idx[i]]) # first :LINE_RETURN
+ finish = from(tokens[lr_idx[i+1]]) # next :LINE_RETURN
line = subs(st, start, finish)
indent = ""
if startswith(line, "\n ")
@@ -155,3 +156,55 @@ function find_indented_blocks(tokens::Vector{Token}, st::String)::Vector{Token}
end
return tokens
end
+
+
+"""
+$SIGNATURES
+
+Fuse consecutive indented code blocks that have only whitespace between them (e.g. one or several
+empty lines) into a single `:CODE_BLOCK_IND` block. The vector `blocks` is modified in place.
+"""
+function merge_indented_code_blocks!(blocks::Vector{OCBlock}, mds::String)::Nothing
+    # positions (in `blocks`) of the indented code blocks
+    idx = findall(β -> β.name == :CODE_BLOCK_IND, blocks)
+    isempty(idx) && return
+    # has_gap[i] is true when something other than whitespace sits between the i-th and the
+    # (i+1)-th indented code block
+    has_gap = map(1:length(idx)-1) do i
+        between = subs(mds, to(blocks[idx[i]]), from(blocks[idx[i+1]]))
+        !isempty(strip(between))
+    end
+
+    curseq     = Int[] # run of adjacent blocks currently being accumulated
+    del_blocks = Int[] # indices handed to `deleteat!` once all the merging is done
+
+    for (i, gap) in enumerate(has_gap)
+        if !gap
+            # nothing in between: extend the current run
+            push!(curseq, i)
+        elseif !isempty(curseq)
+            # the run is interrupted: merge what has been accumulated (this empties curseq)
+            form_super_block!(blocks, idx, curseq, del_blocks)
+        end
+    end
+    # flush a run that reaches the last indented code block
+    isempty(curseq) || form_super_block!(blocks, idx, curseq, del_blocks)
+    # remove the blocks that have been merged
+    deleteat!(blocks, del_blocks)
+    return
+end
+
+
+"""
+$SIGNATURES
+
+Helper function to [`merge_indented_code_blocks!`](@ref). On entry, `curseq` holds positions `i`
+into `idx` such that the blocks at `idx[i]` and `idx[i+1]` must be merged; the position following
+the last one is appended so that `curseq` covers the whole run. The first block of the run is then
+replaced in `blocks` by a `:CODE_BLOCK_IND` block going from its opening token to the closing token
+of the last block of the run, the indices in `blocks` of the other blocks of the run are appended
+to `del_blocks`, and `curseq` is emptied.
+"""
+function form_super_block!(blocks::Vector{OCBlock}, idx::Vector{Int},
+                           curseq::Vector{Int}, del_blocks::Vector{Int})::Nothing
+    # curseq only lists the left member of each adjacent pair, add the last member of the run
+    push!(curseq, curseq[end]+1)
+    first_block = blocks[idx[curseq[1]]]
+    last_block  = blocks[idx[curseq[end]]]
+    # replace the first block with the super block
+    blocks[idx[curseq[1]]] = OCBlock(:CODE_BLOCK_IND, (otok(first_block) => ctok(last_block)))
+    # append all blocks but the first to the delete list; `curseq` indexes into `idx` whereas
+    # `del_blocks` is used to index into `blocks` (via `deleteat!`), so map through `idx`
+    append!(del_blocks, idx[curseq[2:end]])
+    empty!(curseq)
+    return
+end
diff --git a/test/converter/markdown3.jl b/test/converter/markdown3.jl
index 95573867d..a4d119489 100644
--- a/test/converter/markdown3.jl
+++ b/test/converter/markdown3.jl
@@ -55,7 +55,7 @@ end
tokens, = steps[:tokenization]
@test tokens[7].name == :CHAR_LINEBREAK
h = st |> seval
- @test isapproxstr(st |> seval, """
+ @test isapproxstr(st |> seval, raw"""
Hello \ blah \ end
and B \ c
end
and
A \ b
@@ -285,4 +285,51 @@ end
end
""")
+
+ st = raw"""
+ A
+
+ function foo()
+
+ return 2
+
+ end
+
+ function bar()
+ return 3
+ end
+
+ B
+
+ function baz()
+ return 5
+
+ end
+
+ C
+ """ * J.EOS
+ isapproxstr(st |> seval, raw"""
+ A
function foo()
+
+ return 2
+
+ end
+
+ function bar()
+ return 3
+ end
+ B function baz()
+ return 5
+
+ end
+ C
+ """)
+end
+
+
+@testset "More ``" begin
+    # double backticks must yield inline code, not be left as-is or become a code block
+    st = raw"""
+        A ``blah``.
+        """ * J.EOS
+    # without `@test` the comparison result is discarded and the testset checks nothing
+    @test isapproxstr(st |> seval, """<p>A <code>blah</code>.</p>""")
end
diff --git a/test/runtests.jl b/test/runtests.jl
index 540c1ca90..c482b4766 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -25,6 +25,7 @@ println("🍺")
println("CONVERTER/MD")
include("converter/markdown.jl")
include("converter/markdown2.jl")
+include("converter/markdown3.jl")
include("converter/hyperref.jl")
println("🍺")
diff --git a/test/test_utils.jl b/test/test_utils.jl
index 2f4b93277..41e1d51f3 100644
--- a/test/test_utils.jl
+++ b/test/test_utils.jl
@@ -41,6 +41,7 @@ function explore_md_steps(mds)
# tokenize
tokens = J.find_tokens(mds, J.MD_TOKENS, J.MD_1C_TOKENS)
+ tokens = J.find_indented_blocks(tokens, mds)
steps[:tokenization] = (tokens=tokens,)
# ocblocks