From f5513f6d5c24e801617dc5db535fd468b1a825ad Mon Sep 17 00:00:00 2001 From: Thibaut Lienart Date: Tue, 10 Sep 2019 12:56:57 +0200 Subject: [PATCH] Fixes to markdown parser following trials on Julia Blog posts (#218) * small fixes while playing with jekyll migration * fixing issue with indented code blocks separated by empty lines * fixing a glitch with double backticks --- src/converter/fixer.jl | 2 +- src/converter/md.jl | 6 ++-- src/converter/md_blocks.jl | 6 ++-- src/converter/md_utils.jl | 6 ++-- src/misc_html.jl | 9 +++++- src/parser/ocblocks.jl | 59 +++++++++++++++++++++++++++++++++++-- test/converter/markdown3.jl | 49 +++++++++++++++++++++++++++++- test/runtests.jl | 1 + test/test_utils.jl | 1 + 9 files changed, 124 insertions(+), 15 deletions(-) diff --git a/src/converter/fixer.jl b/src/converter/fixer.jl index 06b299ed9..bb61c727d 100644 --- a/src/converter/fixer.jl +++ b/src/converter/fixer.jl @@ -14,7 +14,7 @@ function find_and_fix_md_links(hs::String)::String # the regexes very readable... # here we're looking for [id]: link; 1=id 2=link - m_link_defs = collect(eachmatch(r"[((?:(?!]).)*?)]:\s((?:(?!\<\/p\>)\S)+)", hs)) + m_link_defs = collect(eachmatch(r"[((?:(?!]).)*?)]:\s+((?:(?!\<\/p\>)\S)+)", hs)) def_names = [def.captures[1] for def in m_link_defs] def_links = [def.captures[2] for def in m_link_defs] diff --git a/src/converter/md.jl b/src/converter/md.jl index 066c7819a..3e07b3b5d 100644 --- a/src/converter/md.jl +++ b/src/converter/md.jl @@ -39,9 +39,11 @@ function convert_md(mds::String, pre_lxdefs::Vector{LxDef}=Vector{LxDef}(); #> 2. Open-Close blocks (OCBlocks) #>> a. find them blocks, tokens = find_all_ocblocks(tokens, MD_OCB_ALL) - #>> b. now that blocks have been found, line-returns can be dropped + #>> b. merge CODE_BLOCK_IND which are separated by emptyness + merge_indented_code_blocks!(blocks, mds) + #>> c. now that blocks have been found, line-returns can be dropped filter!(τ -> τ.name ∉ L_RETURNS, tokens) - #>> c. 
filter out "fake headers" (opening ### that are not at the start of a line) + #>> d. filter out "fake headers" (opening ### that are not at the start of a line) filter!(β -> validate_header_block(β), blocks) #> 3. LaTeX commands diff --git a/src/converter/md_blocks.jl b/src/converter/md_blocks.jl index ebd7a7c2b..2e8aeeb9d 100644 --- a/src/converter/md_blocks.jl +++ b/src/converter/md_blocks.jl @@ -11,10 +11,10 @@ function convert_block(β::AbstractBlock, lxcontext::LxContext)::AbstractString # Return relevant interpolated string based on case βn = β.name βn ∈ MD_HEADER && return convert_header(β) - βn == :CODE_INLINE && return md2html(β.ss; stripp=true, code=true) + βn == :CODE_INLINE && return html_code_inline(content(β) |> Markdown.htmlesc) βn == :CODE_BLOCK_LANG && return convert_code_block(β.ss) βn == :CODE_BLOCK_IND && return convert_indented_code_block(β.ss) - βn == :CODE_BLOCK && return md2html(β.ss; code=true) + βn == :CODE_BLOCK && return md2html(β.ss) βn == :ESCAPE && return chop(β.ss, head=3, tail=3) # Math block --> needs to call further processing to resolve possible latex @@ -185,5 +185,5 @@ function convert_indented_code_block(ss::SubString)::String # 1. decrease indentation of all lines (either frontal \n\t or \n⎵⎵⎵⎵) code = replace(ss, r"\n(?:\t| {4})" => "\n") # 2. return; lang is a LOCAL_PAGE_VARS that is julia by default and can be set - return html_code(code, "{{fill lang}}") + return html_code(strip(code), "{{fill lang}}") end diff --git a/src/converter/md_utils.jl b/src/converter/md_utils.jl index 808eac407..4edabf8d6 100644 --- a/src/converter/md_utils.jl +++ b/src/converter/md_utils.jl @@ -6,13 +6,11 @@ that don't need to be further considered and don't contain anything else than ma The boolean `stripp` indicates whether to remove the inserted `

` and `

` by the base markdown processor, this is relevant for things that are parsed within latex commands etc. """ -function md2html(ss::AbstractString; stripp::Bool=false, code::Bool=false)::AbstractString - +function md2html(ss::AbstractString; stripp::Bool=false)::AbstractString + # if there's nothing, return that... isempty(ss) && return ss - # Use Julia's Markdown parser followed by Julia's MD->HTML conversion partial = ss |> fix_inserts |> Markdown.parse |> Markdown.html - # In some cases, base converter adds

...

\n which we might not want stripp || return partial startswith(partial, "

") && (partial = chop(partial, head=3)) diff --git a/src/misc_html.jl b/src/misc_html.jl index 8b53a70ac..a8928a7af 100644 --- a/src/misc_html.jl +++ b/src/misc_html.jl @@ -32,7 +32,7 @@ html_img(src::AbstractString, alt::AbstractString="") = """ $(SIGNATURES) -Convenience function to introduce an image. +Convenience function to introduce a code block. """ function html_code(c::AbstractString, lang::AbstractString="") isempty(c) && return "" @@ -43,6 +43,13 @@ end """ $(SIGNATURES) +Convenience function to introduce inline code. +""" +html_code_inline(c::AbstractString) = "$c" + +""" +$(SIGNATURES) + Insertion of a visible red message in HTML to show there was a problem. """ html_err(mess::String="") = "

// $mess //

" diff --git a/src/parser/ocblocks.jl b/src/parser/ocblocks.jl index be2131825..a24744836 100644 --- a/src/parser/ocblocks.jl +++ b/src/parser/ocblocks.jl @@ -5,7 +5,8 @@ Find active blocks between an opening token (`otoken`) and a closing token `ctok nested (e.g. braces). Return the list of such blocks. If `deactivate` is `true`, all the tokens within the block will be marked as inactive (for further, separate processing). """ -function find_ocblocks(tokens::Vector{Token}, ocproto::OCProto; inmath=false) +function find_ocblocks(tokens::Vector{Token}, ocproto::OCProto; + inmath=false)::Tuple{Vector{OCBlock}, Vector{Token}} ntokens = length(tokens) active_tokens = ones(Bool, length(tokens)) @@ -127,8 +128,8 @@ function find_indented_blocks(tokens::Vector{Token}, st::String)::Vector{Token} # blocks. for i in 1:length(lr_idx)-1 # capture start and finish of the line (from line return to line return) - start = from(tokens[lr_idx[i]]) # first :LINE_RETURN - finish = from(tokens[lr_idx[i+1]]) # next :LINE_RETURN + start = from(tokens[lr_idx[i]]) # first :LINE_RETURN + finish = from(tokens[lr_idx[i+1]]) # next :LINE_RETURN line = subs(st, start, finish) indent = "" if startswith(line, "\n ") @@ -155,3 +156,55 @@ function find_indented_blocks(tokens::Vector{Token}, st::String)::Vector{Token} end return tokens end + + +""" +$SIGNATURES + +When two indented code blocks follow each other and there's nothing in between (empty line(s)), +merge them into a super block. 
+""" +function merge_indented_code_blocks!(blocks::Vector{OCBlock}, mds::String)::Nothing + # indices of CODE_BLOCK_IND + idx = [i for i in eachindex(blocks) if blocks[i].name == :CODE_BLOCK_IND] + isempty(idx) && return + # check if they're separated by something or nothing + inter_space = [(subs(mds, to(blocks[idx[i]]), from(blocks[idx[i+1]])) |> strip |> length) > 0 + for i in 1:length(idx)-1] + + curseq = Int[] # to keep track of current list of blocks to merge + del_blocks = Int[] # to keep track of blocks that will be removed afterwards + + # if there's no inter_space, add to the list, if there is, close and merge + for i in eachindex(inter_space) + if inter_space[i] && !isempty(curseq) + # close and merge all in curseq and empty curseq + form_super_block!(blocks, idx, curseq, del_blocks) + elseif !inter_space[i] + push!(curseq, i) + end + end + !isempty(curseq) && form_super_block!(blocks, idx, curseq, del_blocks) + # remove the blocks that have been merged + deleteat!(blocks, del_blocks) + return +end + + +""" +$SIGNATURES + +Helper function to [`merge_indented_code_blocks!`](@ref). +""" +function form_super_block!(blocks::Vector{OCBlock}, idx::Vector{Int}, + curseq::Vector{Int}, del_blocks::Vector{Int})::Nothing + push!(curseq, curseq[end]+1) + first_block = blocks[idx[curseq[1]]] + last_block = blocks[idx[curseq[end]]] + # replace the first block with the super block + blocks[idx[curseq[1]]] = OCBlock(:CODE_BLOCK_IND, (otok(first_block) => ctok(last_block))) + # append all blocks but the first to the delete list; curseq holds positions in idx, the list needs positions in blocks + append!(del_blocks, idx[curseq[2:end]]) + empty!(curseq) + return +end diff --git a/test/converter/markdown3.jl b/test/converter/markdown3.jl index 95573867d..a4d119489 100644 --- a/test/converter/markdown3.jl +++ b/test/converter/markdown3.jl @@ -55,7 +55,7 @@ end tokens, = steps[:tokenization] @test tokens[7].name == :CHAR_LINEBREAK h = st |> seval - @test isapproxstr(st |> seval, """ + @test isapproxstr(st |> seval, raw"""

Hello \ blah \ end and B \ c end
and

A \ b
@@ -285,4 +285,51 @@ end

end

""") + + st = raw""" + A + + function foo() + + return 2 + + end + + function bar() + return 3 + end + + B + + function baz() + return 5 + + end + + C + """ * J.EOS + isapproxstr(st |> seval, raw""" +

A

function foo()
+
+                                return 2
+
+                            end
+
+                            function bar()
+                                return 3
+                            end
+ B
function baz()
+                                return 5
+
+                            end
+ C

+ """) +end + + +@testset "More ``" begin + st = raw""" + A ``blah``. + """ * J.EOS + isapproxstr(st |> seval, """

A blah.

""") end diff --git a/test/runtests.jl b/test/runtests.jl index 540c1ca90..c482b4766 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -25,6 +25,7 @@ println("🍺") println("CONVERTER/MD") include("converter/markdown.jl") include("converter/markdown2.jl") +include("converter/markdown3.jl") include("converter/hyperref.jl") println("🍺") diff --git a/test/test_utils.jl b/test/test_utils.jl index 2f4b93277..41e1d51f3 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -41,6 +41,7 @@ function explore_md_steps(mds) # tokenize tokens = J.find_tokens(mds, J.MD_TOKENS, J.MD_1C_TOKENS) + tokens = J.find_indented_blocks(tokens, mds) steps[:tokenization] = (tokens=tokens,) # ocblocks