Fixes to markdown parser following trials on Julia Blog posts (#218)

* small fixes while playing with jekyll migration * fixing issue with indented code blocks separated by empty lines * fixing a glitch with double backticks
tlienart · Sep 10, 2019 · f5513f6 · f5513f6
1 parent ea8cc3e
commit f5513f6
Show file tree

Hide file tree

Showing 9 changed files with 124 additions and 15 deletions.
diff --git a/src/converter/fixer.jl b/src/converter/fixer.jl
@@ -14,7 +14,7 @@ function find_and_fix_md_links(hs::String)::String
     # the regexes very readable...
 
     # here we're looking for [id]: link; 1=id 2=link
-    m_link_defs = collect(eachmatch(r"&#91;((?:(?!&#93;).)*?)&#93;:\s((?:(?!\<\/p\>)\S)+)", hs))
+    m_link_defs = collect(eachmatch(r"&#91;((?:(?!&#93;).)*?)&#93;:\s+((?:(?!\<\/p\>)\S)+)", hs))
 
     def_names = [def.captures[1] for def in m_link_defs]
     def_links = [def.captures[2] for def in m_link_defs]

diff --git a/src/converter/md.jl b/src/converter/md.jl
@@ -39,9 +39,11 @@ function convert_md(mds::String, pre_lxdefs::Vector{LxDef}=Vector{LxDef}();
     #> 2. Open-Close blocks (OCBlocks)
     #>> a. find them
     blocks, tokens = find_all_ocblocks(tokens, MD_OCB_ALL)
-    #>> b. now that blocks have been found, line-returns can be dropped
+    #>> b. merge CODE_BLOCK_IND blocks which are only separated by empty lines
+    merge_indented_code_blocks!(blocks, mds)
+    #>> c. now that blocks have been found, line-returns can be dropped
     filter!(τ -> τ.name ∉ L_RETURNS, tokens)
-    #>> c. filter out "fake headers" (opening ### that are not at the start of a line)
+    #>> d. filter out "fake headers" (opening ### that are not at the start of a line)
     filter!(β -> validate_header_block(β), blocks)
 
     #> 3. LaTeX commands

diff --git a/src/converter/md_blocks.jl b/src/converter/md_blocks.jl
@@ -11,10 +11,10 @@ function convert_block(β::AbstractBlock, lxcontext::LxContext)::AbstractString
     # Return relevant interpolated string based on case
     βn = β.name
     βn ∈  MD_HEADER        && return convert_header(β)
-    βn == :CODE_INLINE     && return md2html(β.ss; stripp=true, code=true)
+    βn == :CODE_INLINE     && return html_code_inline(content(β) |> Markdown.htmlesc)
     βn == :CODE_BLOCK_LANG && return convert_code_block(β.ss)
     βn == :CODE_BLOCK_IND  && return convert_indented_code_block(β.ss)
-    βn == :CODE_BLOCK      && return md2html(β.ss; code=true)
+    βn == :CODE_BLOCK      && return md2html(β.ss)
     βn == :ESCAPE          && return chop(β.ss, head=3, tail=3)
 
     # Math block --> needs to call further processing to resolve possible latex
@@ -185,5 +185,5 @@ function convert_indented_code_block(ss::SubString)::String
     # 1. decrease indentation of all lines (either frontal \n\t or \n⎵⎵⎵⎵)
     code = replace(ss, r"\n(?:\t| {4})" => "\n")
     # 2. return; lang is a LOCAL_PAGE_VARS that is julia by default and can be set
-    return html_code(code, "{{fill lang}}")
+    return html_code(strip(code), "{{fill lang}}")
 end
diff --git a/src/converter/md_utils.jl b/src/converter/md_utils.jl
@@ -6,13 +6,11 @@ that don't need to be further considered and don't contain anything else than ma
 The boolean `stripp` indicates whether to remove the inserted `<p>` and `</p>` by the base markdown
 processor, this is relevant for things that are parsed within latex commands etc.
 """
-function md2html(ss::AbstractString; stripp::Bool=false, code::Bool=false)::AbstractString
-
+function md2html(ss::AbstractString; stripp::Bool=false)::AbstractString
+    # if there's nothing, return that...
     isempty(ss) && return ss
-
     # Use Julia's Markdown parser followed by Julia's MD->HTML conversion
     partial = ss |> fix_inserts |> Markdown.parse |> Markdown.html
-
     # In some cases, base converter adds <p>...</p>\n which we might not want
     stripp || return partial
     startswith(partial, "<p>")    && (partial = chop(partial, head=3))

diff --git a/src/misc_html.jl b/src/misc_html.jl
@@ -32,7 +32,7 @@ html_img(src::AbstractString, alt::AbstractString="") =
 """
 $(SIGNATURES)
 
-Convenience function to introduce an image.
+Convenience function to introduce a code block.
 """
 function html_code(c::AbstractString, lang::AbstractString="")
     isempty(c) && return ""
@@ -43,6 +43,13 @@ end
 """
 $(SIGNATURES)
 
+Convenience function to introduce inline code.
+"""
+html_code_inline(c::AbstractString) = string("<code>", c, "</code>")
+
+"""
+$(SIGNATURES)
+
 Insertion of a visible red message in HTML to show there was a problem.
 """
 html_err(mess::String="") = "<p><span style=\"color:red;\">// $mess //</span></p>"
diff --git a/src/parser/ocblocks.jl b/src/parser/ocblocks.jl
@@ -5,7 +5,8 @@ Find active blocks between an opening token (`otoken`) and a closing token `ctok
 nested (e.g. braces). Return the list of such blocks. If `deactivate` is `true`, all the tokens
 within the block will be marked as inactive (for further, separate processing).
 """
-function find_ocblocks(tokens::Vector{Token}, ocproto::OCProto; inmath=false)
+function find_ocblocks(tokens::Vector{Token}, ocproto::OCProto;
+                       inmath=false)::Tuple{Vector{OCBlock}, Vector{Token}}
 
     ntokens       = length(tokens)
     active_tokens = ones(Bool, length(tokens))
@@ -127,8 +128,8 @@ function find_indented_blocks(tokens::Vector{Token}, st::String)::Vector{Token}
     # blocks.
     for i in 1:length(lr_idx)-1
         # capture start and finish of the line (from line return to line return)
-        start  = from(tokens[lr_idx[i]])  # first :LINE_RETURN
-        finish = from(tokens[lr_idx[i+1]]) # next :LINE_RETURN
+        start  = from(tokens[lr_idx[i]])   # first :LINE_RETURN
+        finish = from(tokens[lr_idx[i+1]]) # next  :LINE_RETURN
         line   = subs(st, start, finish)
         indent = ""
         if startswith(line, "\n    ")
@@ -155,3 +156,55 @@ function find_indented_blocks(tokens::Vector{Token}, st::String)::Vector{Token}
     end
     return tokens
 end
+
+
+"""
+$SIGNATURES
+
+When two indented code blocks follow each other and there's nothing in between (empty line(s)),
+merge them into a super block.
+"""
+function merge_indented_code_blocks!(blocks::Vector{OCBlock}, mds::String)::Nothing
+    # positions in `blocks` of the indented code blocks
+    ind_pos = findall(β -> β.name == :CODE_BLOCK_IND, blocks)
+    isempty(ind_pos) && return
+
+    # true when the text between the k-th and (k+1)-th indented block still has
+    # characters left after stripping whitespace
+    function separated(k)
+        gap = subs(mds, to(blocks[ind_pos[k]]), from(blocks[ind_pos[k+1]]))
+        return length(strip(gap)) > 0
+    end
+    gaps = [separated(k) for k in 1:length(ind_pos)-1]
+
+    pending = Int[] # run of consecutive gaps (positions in `ind_pos`) with nothing in between
+    doomed  = Int[] # collected by the helper, handed to `deleteat!` at the very end
+
+    # an empty gap extends the current run; a non-empty gap closes it (if there is one)
+    for (k, has_content) in enumerate(gaps)
+        if !has_content
+            push!(pending, k)
+        elseif !isempty(pending)
+            form_super_block!(blocks, ind_pos, pending, doomed)
+        end
+    end
+    # a run may still be open once the last gap has been looked at
+    isempty(pending) || form_super_block!(blocks, ind_pos, pending, doomed)
+
+    # drop the blocks that were absorbed into a super block
+    deleteat!(blocks, doomed)
+    return
+end
+
+
+"""
+$SIGNATURES
+
+Helper function to [`merge_indented_code_blocks!`](@ref).
+"""
+function form_super_block!(blocks::Vector{OCBlock}, idx::Vector{Int},
+                           curseq::Vector{Int}, del_blocks::Vector{Int})::Nothing
+    # `curseq` holds positions in `idx` (not in `blocks`): the caller records entry k
+    # when nothing but whitespace separates blocks[idx[k]] and blocks[idx[k+1]], so
+    # the run of blocks to merge extends one past the last recorded entry.
+    push!(curseq, curseq[end]+1)
+    first_block = blocks[idx[curseq[1]]]
+    last_block  = blocks[idx[curseq[end]]]
+    # replace the first block with the super block spanning the whole run
+    blocks[idx[curseq[1]]] = OCBlock(:CODE_BLOCK_IND, (otok(first_block) => ctok(last_block)))
+    # queue all blocks of the run but the first for deletion; the caller applies
+    # `del_blocks` to `blocks` with `deleteat!`, so the entries have to be positions
+    # in `blocks`, i.e. mapped through `idx` (using `curseq` directly would delete
+    # the wrong blocks whenever `idx` is not simply 1, 2, 3, ...)
+    append!(del_blocks, idx[curseq[2:end]])
+    # reset so the caller can start collecting the next run
+    empty!(curseq)
+    return
+end
diff --git a/test/converter/markdown3.jl b/test/converter/markdown3.jl
@@ -55,7 +55,7 @@ end
     tokens, = steps[:tokenization]
     @test tokens[7].name == :CHAR_LINEBREAK
     h = st |> seval
-    @test isapproxstr(st |> seval, """
+    @test isapproxstr(st |> seval, raw"""
                         <p>Hello &#92; blah &#92; end
                         and <code>B \ c</code> end <br/> and
                         <pre><code>A \ b</code></pre>
@@ -285,4 +285,51 @@ end
                         </ul>
                         <p>end</p>
                         """)
+
+    st = raw"""
+        A
+
+            function foo()
+
+                return 2
+
+            end
+
+            function bar()
+                return 3
+            end
+
+        B
+
+            function baz()
+                return 5
+
+            end
+
+        C
+        """ * J.EOS
+    isapproxstr(st |> seval, raw"""
+                            <p>A <pre><code class="language-julia">function foo()
+
+                                return 2
+
+                            end
+
+                            function bar()
+                                return 3
+                            end</code></pre>
+                            B <pre><code class="language-julia">function baz()
+                                return 5
+
+                            end</code></pre>
+                            C</p>
+                            """)
+end
+
+
+@testset "More ``" begin
+    st = raw"""
+         A ``blah``.
+         """ * J.EOS
+    # wrap the comparison in @test: a bare call discards the result and can never fail
+    @test isapproxstr(st |> seval, """<p>A <code>blah</code>.</p>""")
 end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -25,6 +25,7 @@ println("🍺")
 println("CONVERTER/MD")
 include("converter/markdown.jl")
 include("converter/markdown2.jl")
+include("converter/markdown3.jl")
 include("converter/hyperref.jl")
 println("🍺")
 

diff --git a/test/test_utils.jl b/test/test_utils.jl
@@ -41,6 +41,7 @@ function explore_md_steps(mds)
 
     # tokenize
     tokens = J.find_tokens(mds, J.MD_TOKENS, J.MD_1C_TOKENS)
+    tokens = J.find_indented_blocks(tokens, mds)
     steps[:tokenization] = (tokens=tokens,)
 
     # ocblocks