Skip to content

Commit

Permalink
Fixes to markdown parser following trials on Julia Blog posts (#218)
Browse files Browse the repository at this point in the history
* small fixes while playing with jekyll migration

* fixing issue with indented code blocks separated by empty lines

* fixing a glitch with double backticks
  • Loading branch information
tlienart authored Sep 10, 2019
1 parent ea8cc3e commit f5513f6
Show file tree
Hide file tree
Showing 9 changed files with 124 additions and 15 deletions.
2 changes: 1 addition & 1 deletion src/converter/fixer.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ function find_and_fix_md_links(hs::String)::String
# the regexes very readable...

# here we're looking for [id]: link; 1=id 2=link
m_link_defs = collect(eachmatch(r"&#91;((?:(?!&#93;).)*?)&#93;:\s((?:(?!\<\/p\>)\S)+)", hs))
m_link_defs = collect(eachmatch(r"&#91;((?:(?!&#93;).)*?)&#93;:\s+((?:(?!\<\/p\>)\S)+)", hs))

def_names = [def.captures[1] for def in m_link_defs]
def_links = [def.captures[2] for def in m_link_defs]
Expand Down
6 changes: 4 additions & 2 deletions src/converter/md.jl
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,11 @@ function convert_md(mds::String, pre_lxdefs::Vector{LxDef}=Vector{LxDef}();
#> 2. Open-Close blocks (OCBlocks)
#>> a. find them
blocks, tokens = find_all_ocblocks(tokens, MD_OCB_ALL)
#>> b. now that blocks have been found, line-returns can be dropped
#>> b. merge CODE_BLOCK_IND blocks which are only separated by whitespace (e.g. empty lines)
merge_indented_code_blocks!(blocks, mds)
#>> c. now that blocks have been found, line-returns can be dropped
filter!-> τ.name L_RETURNS, tokens)
#>> c. filter out "fake headers" (opening ### that are not at the start of a line)
#>> d. filter out "fake headers" (opening ### that are not at the start of a line)
filter!-> validate_header_block(β), blocks)

#> 3. LaTeX commands
Expand Down
6 changes: 3 additions & 3 deletions src/converter/md_blocks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@ function convert_block(β::AbstractBlock, lxcontext::LxContext)::AbstractString
# Return relevant interpolated string based on case
βn = β.name
βn MD_HEADER && return convert_header(β)
βn == :CODE_INLINE && return md2html.ss; stripp=true, code=true)
βn == :CODE_INLINE && return html_code_inline(content(β) |> Markdown.htmlesc)
βn == :CODE_BLOCK_LANG && return convert_code_block.ss)
βn == :CODE_BLOCK_IND && return convert_indented_code_block.ss)
βn == :CODE_BLOCK && return md2html.ss; code=true)
βn == :CODE_BLOCK && return md2html.ss)
βn == :ESCAPE && return chop.ss, head=3, tail=3)

# Math block --> needs to call further processing to resolve possible latex
Expand Down Expand Up @@ -185,5 +185,5 @@ function convert_indented_code_block(ss::SubString)::String
# 1. decrease indentation of all lines (either frontal \n\t or \n⎵⎵⎵⎵)
code = replace(ss, r"\n(?:\t| {4})" => "\n")
# 2. return; lang is a LOCAL_PAGE_VARS that is julia by default and can be set
return html_code(code, "{{fill lang}}")
return html_code(strip(code), "{{fill lang}}")
end
6 changes: 2 additions & 4 deletions src/converter/md_utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,11 @@ that don't need to be further considered and don't contain anything else than ma
The boolean `stripp` indicates whether to remove the inserted `<p>` and `</p>` by the base markdown
processor, this is relevant for things that are parsed within latex commands etc.
"""
function md2html(ss::AbstractString; stripp::Bool=false, code::Bool=false)::AbstractString

function md2html(ss::AbstractString; stripp::Bool=false)::AbstractString
# if there's nothing, return that...
isempty(ss) && return ss

# Use Julia's Markdown parser followed by Julia's MD->HTML conversion
partial = ss |> fix_inserts |> Markdown.parse |> Markdown.html

# In some cases, base converter adds <p>...</p>\n which we might not want
stripp || return partial
startswith(partial, "<p>") && (partial = chop(partial, head=3))
Expand Down
9 changes: 8 additions & 1 deletion src/misc_html.jl
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ html_img(src::AbstractString, alt::AbstractString="") =
"""
$(SIGNATURES)
Convenience function to introduce an image.
Convenience function to introduce a code block.
"""
function html_code(c::AbstractString, lang::AbstractString="")
isempty(c) && return ""
Expand All @@ -43,6 +43,13 @@ end
"""
$(SIGNATURES)
Convenience function to introduce inline code.
"""
html_code_inline(c::AbstractString) = "<code>$c</code>"

"""
$(SIGNATURES)
Insertion of a visible red message in HTML to show there was a problem.
"""
html_err(mess::String="") = "<p><span style=\"color:red;\">// $mess //</span></p>"
59 changes: 56 additions & 3 deletions src/parser/ocblocks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ Find active blocks between an opening token (`otoken`) and a closing token `ctok
nested (e.g. braces). Return the list of such blocks. If `deactivate` is `true`, all the tokens
within the block will be marked as inactive (for further, separate processing).
"""
function find_ocblocks(tokens::Vector{Token}, ocproto::OCProto; inmath=false)
function find_ocblocks(tokens::Vector{Token}, ocproto::OCProto;
inmath=false)::Tuple{Vector{OCBlock}, Vector{Token}}

ntokens = length(tokens)
active_tokens = ones(Bool, length(tokens))
Expand Down Expand Up @@ -127,8 +128,8 @@ function find_indented_blocks(tokens::Vector{Token}, st::String)::Vector{Token}
# blocks.
for i in 1:length(lr_idx)-1
# capture start and finish of the line (from line return to line return)
start = from(tokens[lr_idx[i]]) # first :LINE_RETURN
finish = from(tokens[lr_idx[i+1]]) # next :LINE_RETURN
start = from(tokens[lr_idx[i]]) # first :LINE_RETURN
finish = from(tokens[lr_idx[i+1]]) # next :LINE_RETURN
line = subs(st, start, finish)
indent = ""
if startswith(line, "\n ")
Expand All @@ -155,3 +156,55 @@ function find_indented_blocks(tokens::Vector{Token}, st::String)::Vector{Token}
end
return tokens
end


"""
$SIGNATURES
When two indented code blocks follow each other and there's nothing in between (empty line(s)),
merge them into a super block.
"""
function merge_indented_code_blocks!(blocks::Vector{OCBlock}, mds::String)::Nothing
# indices of CODE_BLOCK_IND
idx = [i for i in eachindex(blocks) if blocks[i].name == :CODE_BLOCK_IND]
isempty(idx) && return
# check if they're separated by something or nothing
inter_space = [(subs(mds, to(blocks[idx[i]]), from(blocks[idx[i+1]])) |> strip |> length) > 0
for i in 1:length(idx)-1]

curseq = Int[] # to keep track of current list of blocks to merge
del_blocks = Int[] # to keep track of blocks that will be removed afterwards

# if there's no inter_space, add to the list, if there is, close and merge
for i in eachindex(inter_space)
if inter_space[i] && !isempty(curseq)
# close and merge all in curseq and empty curseq
form_super_block!(blocks, idx, curseq, del_blocks)
elseif !inter_space[i]
push!(curseq, i)
end
end
!isempty(curseq) && form_super_block!(blocks, idx, curseq, del_blocks)
# remove the blocks that have been merged
deleteat!(blocks, del_blocks)
return
end


"""
$SIGNATURES
Helper function to [`merge_indented_code_blocks`](@ref).
"""
function form_super_block!(blocks::Vector{OCBlock}, idx::Vector{Int},
curseq::Vector{Int}, del_blocks::Vector{Int})::Nothing
push!(curseq, curseq[end]+1)
first_block = blocks[idx[curseq[1]]]
last_block = blocks[idx[curseq[end]]]
# replace the first block with the super block
blocks[idx[curseq[1]]] = OCBlock(:CODE_BLOCK_IND, (otok(first_block) => ctok(last_block)))
# append all blocks but the first to the delete list
append!(del_blocks, curseq[2:end])
empty!(curseq)
return
end
49 changes: 48 additions & 1 deletion test/converter/markdown3.jl
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ end
tokens, = steps[:tokenization]
@test tokens[7].name == :CHAR_LINEBREAK
h = st |> seval
@test isapproxstr(st |> seval, """
@test isapproxstr(st |> seval, raw"""
<p>Hello &#92; blah &#92; end
and <code>B \ c</code> end <br/> and
<pre><code>A \ b</code></pre>
Expand Down Expand Up @@ -285,4 +285,51 @@ end
</ul>
<p>end</p>
""")

st = raw"""
A
function foo()
return 2
end
function bar()
return 3
end
B
function baz()
return 5
end
C
""" * J.EOS
isapproxstr(st |> seval, raw"""
<p>A <pre><code class="language-julia">function foo()
return 2
end
function bar()
return 3
end</code></pre>
B <pre><code class="language-julia">function baz()
return 5
end</code></pre>
C</p>
""")
end


# Double backticks used as inline-code delimiters must yield a plain `<code>` element.
@testset "More ``" begin
    st = raw"""
        A ``blah``.
        """ * J.EOS
    # without `@test` the comparison result is discarded and nothing is actually checked
    @test isapproxstr(st |> seval, """<p>A <code>blah</code>.</p>""")
end
1 change: 1 addition & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ println("🍺")
println("CONVERTER/MD")
include("converter/markdown.jl")
include("converter/markdown2.jl")
include("converter/markdown3.jl")
include("converter/hyperref.jl")
println("🍺")

Expand Down
1 change: 1 addition & 0 deletions test/test_utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ function explore_md_steps(mds)

# tokenize
tokens = J.find_tokens(mds, J.MD_TOKENS, J.MD_1C_TOKENS)
tokens = J.find_indented_blocks(tokens, mds)
steps[:tokenization] = (tokens=tokens,)

# ocblocks
Expand Down

0 comments on commit f5513f6

Please sign in to comment.