Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix buggy code blocks in markdown renderer #3764

Merged
merged 13 commits into from
Jan 8, 2025
25 changes: 21 additions & 4 deletions app/config/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from config.denylist import USERNAME_DENYLIST
from grandchallenge.components.exceptions import PriorStepFailed
from grandchallenge.core.utils import strtobool
from grandchallenge.core.utils.markdown import BS4Extension
from grandchallenge.core.utils.markdown import ExtendHTMLTagClasses

MEGABYTE = 1024 * 1024
GIGABYTE = 1024 * MEGABYTE
Expand Down Expand Up @@ -789,13 +789,30 @@ def get_private_ip():
"pymdownx.magiclink",
"pymdownx.tasklist",
"pymdownx.tilde",
BS4Extension(),
]
MARKDOWN_POST_PROCESSORS = []
MARKDOWN_POST_PROCESSORS = [
ExtendHTMLTagClasses(
{
"img": ["img-fluid"],
"blockquote": ["blockquote"],
"table": [
"table",
"table-hover",
"table-borderless",
],
"thead": ["thead-light"],
"code": ["codehilite"],
}
jmsmkn marked this conversation as resolved.
Show resolved Hide resolved
),
]
MARKDOWNX_MARKDOWNIFY_FUNCTION = (
"grandchallenge.core.templatetags.bleach.md2html"
)
MARKDOWNX_MARKDOWN_EXTENSION_CONFIGS = {}
MARKDOWNX_MARKDOWN_EXTENSION_CONFIGS = {
"markdown.extensions.codehilite": {
"wrapcode": False,
}
}
MARKDOWNX_IMAGE_MAX_SIZE = {"size": (2000, 0), "quality": 90}
MARKDOWNX_EDITOR_RESIZABLE = "False"

Expand Down
2 changes: 1 addition & 1 deletion app/grandchallenge/core/templatetags/bleach.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def md2html(
post_processors = [*settings.MARKDOWN_POST_PROCESSORS]

if process_youtube_tags:
post_processors.append(YOUTUBE_TAG_SUBSTITUTION)
post_processors = [YOUTUBE_TAG_SUBSTITUTION, *post_processors]
jmsmkn marked this conversation as resolved.
Show resolved Hide resolved

for processor in post_processors:
cleaned_html = processor(cleaned_html)
Expand Down
84 changes: 41 additions & 43 deletions app/grandchallenge/core/utils/markdown.py
Original file line number Diff line number Diff line change
@@ -1,61 +1,59 @@
from xml.etree import ElementTree

from bs4 import BeautifulSoup, Tag
from bs4 import BeautifulSoup
from django.utils.html import escape
from django.utils.safestring import SafeString, mark_safe
from markdown import Extension
from markdown.treeprocessors import Treeprocessor


class BS4Extension(Extension):
def extendMarkdown(self, md): # noqa: N802
md.registerExtension(self)
md.treeprocessors.register(BS4Treeprocessor(md), "bs4_extension", 0)
class BeautifulSoupWithCharEntities(BeautifulSoup):
"""
Soup generator that elegantly handles reserved HTML entity placeholders.

For instance, the soup HTMLparser replaces these (e.g. '<') into their
unicode equivalents (e.g. '<').

class BS4Treeprocessor(Treeprocessor):
def run(self, root):
el_class_dict = {
"img": "img-fluid",
"blockquote": "blockquote",
"table": "table table-hover table-borderless",
"thead": "thead-light",
"code": "codehilite",
}
This messes up things if the HTML is decoded into a string again.
"""

jmsmkn marked this conversation as resolved.
Show resolved Hide resolved
for el in root.iter():
if el.tag in el_class_dict:
self.set_css_class(
element=el, class_name=el_class_dict[el.tag]
)
def __init__(self, /, markup, features="html.parser", **kwargs):
markup = markup.replace("&", "&amp;")

super().__init__(markup=markup, features=features, **kwargs)

for i, html_block in enumerate(self.md.htmlStash.rawHtmlBlocks):
bs4block = BeautifulSoup(html_block, "html.parser")
def decode(self, **kwargs):
# Prevent entity subsitution (e.g. "&" -> "&amp")
kwargs["formatter"] = None
return super().decode(**kwargs)

for tag, tag_class in el_class_dict.items():
for el in bs4block.find_all(tag):
self.set_css_class(element=el, class_name=tag_class)
self.md.htmlStash.rawHtmlBlocks[i] = str(bs4block)

@staticmethod
def set_css_class(*, element, class_name):
if isinstance(element, ElementTree.Element):
current_class = element.attrib.get("class", "")
class ExtendHTMLTagClasses:
def __init__(self, tag_classes):
# Make extensions safe
self.clean_tag_classes = {
jmsmkn marked this conversation as resolved.
Show resolved Hide resolved
t: [escape(c).strip() for c in classes]
jmsmkn marked this conversation as resolved.
Show resolved Hide resolved
for t, classes in tag_classes.items()
}

def __call__(self, html):
input_is_safe = isinstance(html, SafeString)

jmsmkn marked this conversation as resolved.
Show resolved Hide resolved
if class_name not in current_class:
new_class = f"{current_class} {class_name}".strip()
element.set("class", new_class)
soup = BeautifulSoupWithCharEntities(markup=html)

elif isinstance(element, Tag):
if "class" not in element.attrs:
element.attrs["class"] = []
tags = [*self.clean_tag_classes.keys()]
if tags:
for element in soup.find_all(tags):
classes = element.get("class", [])
for new_class in self.clean_tag_classes[element.name]:
if new_class not in classes:
classes.append(new_class)
element["class"] = classes

current_class = element["class"]
new_markup = soup.decode()

for name in class_name.split(" "):
if class_name not in current_class:
current_class.append(name)
if input_is_safe:
new_markup = mark_safe(new_markup)

jmsmkn marked this conversation as resolved.
Show resolved Hide resolved
else:
raise TypeError("Unsupported element")
return new_markup


class LinkBlankTargetExtension(Extension):
Expand Down
123 changes: 64 additions & 59 deletions app/tests/core_tests/test_markdown.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
import textwrap

import pytest
from django.conf import settings
from markdown import markdown
from django.utils.safestring import SafeString, mark_safe

from grandchallenge.core.templatetags.bleach import md2html
from grandchallenge.core.utils.markdown import BS4Treeprocessor
from grandchallenge.core.utils.markdown import ExtendHTMLTagClasses


@pytest.mark.parametrize(
Expand All @@ -30,7 +29,7 @@ def test_function():
),
textwrap.dedent(
"""\
<p><img class="img-fluid" src="test.png"></p>
<p><img class="img-fluid" src="test.png"/></p>
<blockquote class="blockquote">
<p>Quote Me</p>
</blockquote>
Expand All @@ -55,9 +54,9 @@ def test_function():
</tr>
</tbody>
</table>
<div class="codehilite"><pre><span></span><code class="codehilite"><span class="k">def</span> <span class="nf">test_function</span><span class="p">():</span>
<div class="codehilite"><pre><span></span><span class="k">def</span> <span class="nf">test_function</span><span class="p">():</span>
<span class="k">pass</span>
</code></pre></div>"""
</pre></div>"""
),
),
(
Expand Down Expand Up @@ -123,10 +122,10 @@ def test_function():
),
textwrap.dedent(
"""\
<p><img class="img-fluid" src="test.png"></p>
<p><img class="img-fluid" src="test-no-class.png"></p>
<p><img class="img-fluid" src="test-empty-class.png"></p>
<p><img class="ml-3 img-fluid" src="test-existing-class.png"></p>
<p><img class="img-fluid" src="test.png"/></p>
<p><img class="img-fluid" src="test-no-class.png"/></p>
<p><img class="img-fluid" src="test-empty-class.png"/></p>
<p><img class="ml-3 img-fluid" src="test-existing-class.png"/></p>
<blockquote class="blockquote">
<p>Quote Me</p>
</blockquote>
Expand All @@ -136,7 +135,6 @@ def test_function():
<blockquote class="ml-3 blockquote">
<p>Quote Me Existing Class</p>
</blockquote>

<table class="table table-hover table-borderless">
<thead class="thead-light">
<tr>
Expand Down Expand Up @@ -167,7 +165,6 @@ def test_function():
<tbody>
</tbody>
</table>

<table class="ml-3 table table-hover table-borderless">
<thead class="ml-3 thead-light">
<tr>
Expand All @@ -177,14 +174,11 @@ def test_function():
<tbody>
</tbody>
</table>

<div class="codehilite"><pre><span></span><code class="codehilite"><span class="k">def</span> <span class="nf">test_function</span><span class="p">():</span>
<div class="codehilite"><pre><span></span><span class="k">def</span> <span class="nf">test_function</span><span class="p">():</span>
<span class="k">pass</span>
</code></pre></div>

</pre></div>
<div><pre><code class="codehilite">no class</code></pre></div>
<div class="ml-3"><pre><code class="ml-3 codehilite">existing class</code></pre></div>

<p><del>Delete me</del></p>
<p>CH<sub>3</sub>CH<sub>2</sub>OH
text<sub>a subscript</sub></p>
Expand All @@ -194,6 +188,14 @@ def test_function():
</ul>"""
),
),
(
"&lt;script&gt;alert(&quot;foo&quot;)&lt;/script&gt;",
"<p>&lt;script&gt;alert(&quot;foo&quot;)&lt;/script&gt;</p>",
),
(
"[![](http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg)](https://google.com)",
'<p><a href="https://google.com"><img class="img-fluid" src="http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg"/></a></p>',
),
),
)
def test_markdown_rendering(markdown_with_html, expected_output):
Expand All @@ -202,54 +204,57 @@ def test_markdown_rendering(markdown_with_html, expected_output):


@pytest.mark.parametrize(
"markdown_with_html, expected_output",
jmsmkn marked this conversation as resolved.
Show resolved Hide resolved
(
(
"""<img src="https://rumc-gcorg-p-public.s3.amazonaws.com/i/2023/10/20/042179f0-ad8c-4c0b-af54-7e81ba389a90.jpeg"/>
[![](http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg)](https://google.com)""",
"""<p><img class="img-fluid" src="https://rumc-gcorg-p-public.s3.amazonaws.com/i/2023/10/20/042179f0-ad8c-4c0b-af54-7e81ba389a90.jpeg"/>
<a href="https://google.com"><img alt="" class="img-fluid" src="http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg" /></a></p>""",
"html, tag_classes, expected_output, is_safe",
[
( # Safe input
mark_safe("<div>Content</div>"),
{},
"<div>Content</div>",
True,
),
(
"""<img class="" src="https://rumc-gcorg-p-public.s3.amazonaws.com/i/2023/10/20/042179f0-ad8c-4c0b-af54-7e81ba389a90.jpeg"/>
[![](http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg)](https://google.com)""",
"""<p><img class="img-fluid" src="https://rumc-gcorg-p-public.s3.amazonaws.com/i/2023/10/20/042179f0-ad8c-4c0b-af54-7e81ba389a90.jpeg"/>
<a href="https://google.com"><img alt="" class="img-fluid" src="http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg" /></a></p>""",
( # Unsafe input
"<div>Content</div>",
{},
"<div>Content</div>",
False,
),
(
"""<img class="ml-2" src="https://rumc-gcorg-p-public.s3.amazonaws.com/i/2023/10/20/042179f0-ad8c-4c0b-af54-7e81ba389a90.jpeg"/>
[![](http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg)](https://google.com)""",
"""<p><img class="ml-2 img-fluid" src="https://rumc-gcorg-p-public.s3.amazonaws.com/i/2023/10/20/042179f0-ad8c-4c0b-af54-7e81ba389a90.jpeg"/>
<a href="https://google.com"><img alt="" class="img-fluid" src="http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg" /></a></p>""",
( # Escaped classes
mark_safe("<div>Content</div>"),
{"div": ['<script>alert("foo")</script>']},
'<div class="&lt;script&gt;alert(&quot;foo&quot;)&lt;/script&gt;">Content</div>',
True,
),
(
"""<img class="img-fluid" src="https://rumc-gcorg-p-public.s3.amazonaws.com/i/2023/10/20/042179f0-ad8c-4c0b-af54-7e81ba389a90.jpeg"/>
[![](http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg)](https://google.com)""",
"""<p><img class="img-fluid" src="https://rumc-gcorg-p-public.s3.amazonaws.com/i/2023/10/20/042179f0-ad8c-4c0b-af54-7e81ba389a90.jpeg"/>
<a href="https://google.com"><img alt="" class="img-fluid" src="http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg" /></a></p>""",
( # Empty class
'<div class="">Content</div>',
{"div": ["foo"]},
'<div class="foo">Content</div>',
False,
),
(
"""<img class="ml-2 img-fluid" src="https://rumc-gcorg-p-public.s3.amazonaws.com/i/2023/10/20/042179f0-ad8c-4c0b-af54-7e81ba389a90.jpeg"/>
[![](http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg)](https://google.com)""",
"""<p><img class="ml-2 img-fluid" src="https://rumc-gcorg-p-public.s3.amazonaws.com/i/2023/10/20/042179f0-ad8c-4c0b-af54-7e81ba389a90.jpeg"/>
<a href="https://google.com"><img alt="" class="img-fluid" src="http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg" /></a></p>""",
( # Existing class
'<div class="ml-2">Content</div>',
{"div": ["foo"]},
'<div class="ml-2 foo">Content</div>',
False,
),
),
( # Extension class already present
'<div class="foo">Content</div>',
{"div": ["foo"]},
'<div class="foo">Content</div>',
False,
),
( # Existing class + extension class
'<div class="ml-2 foo">Content</div>',
{"div": ["foo"]},
'<div class="ml-2 foo">Content</div>',
False,
),
],
)
def test_setting_class_to_html_img_within_markdown(
markdown_with_html, expected_output
):
output = markdown(
text=markdown_with_html,
extensions=settings.MARKDOWNX_MARKDOWN_EXTENSIONS,
extension_configs=settings.MARKDOWNX_MARKDOWN_EXTENSION_CONFIGS,
)
def test_extend_html_tag_classes(html, tag_classes, expected_output, is_safe):
extender = ExtendHTMLTagClasses(tag_classes)
output = extender(html)

assert output == expected_output


def test_tree_processor_set_css_class_type_error():
with pytest.raises(TypeError):
BS4Treeprocessor.set_css_class(
element="element", class_name="img-fluid"
)
# Check if the output matches the expected safety status
assert isinstance(output, SafeString) == is_safe
19 changes: 9 additions & 10 deletions app/tests/core_tests/test_tag_substitutions.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import textwrap

import pytest
from django.utils.safestring import SafeString, mark_safe

Expand Down Expand Up @@ -131,16 +133,13 @@ def test_no_change_with_no_tag_or_arg_match(content):
assert s == content


EXPECTED_YOUTUBE_EMBED = """<p><div class="embed-responsive embed-responsive-16by9 rounded border-0">
<iframe
src="https://www.youtube-nocookie.com/embed/QCYYhkTlnhQ?disablekb=1&amp;rel=0"
allow="accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture; web-share; fullscreen"
class="embed-responsive-item"
loading="lazy"
sandbox="allow-scripts allow-same-origin allow-presentation allow-popups"
></iframe>
</div>
</p>"""
EXPECTED_YOUTUBE_EMBED = textwrap.dedent(
"""\
<p><div class="embed-responsive embed-responsive-16by9 rounded border-0">
<iframe allow="accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture; web-share; fullscreen" class="embed-responsive-item" loading="lazy" sandbox="allow-scripts allow-same-origin allow-presentation allow-popups" src="https://www.youtube-nocookie.com/embed/QCYYhkTlnhQ?disablekb=1&amp;rel=0"></iframe>
</div>
</p>"""
)


@pytest.mark.parametrize(
Expand Down
Loading