Skip to content

Commit

Permalink
Add HTML postprocessor to extend classes
Browse files Browse the repository at this point in the history
  • Loading branch information
chrisvanrun committed Dec 20, 2024
1 parent 72d3402 commit 42d41dc
Show file tree
Hide file tree
Showing 3 changed files with 181 additions and 91 deletions.
7 changes: 3 additions & 4 deletions app/config/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from config.denylist import USERNAME_DENYLIST
from grandchallenge.components.exceptions import PriorStepFailed
from grandchallenge.core.utils import strtobool
from grandchallenge.core.utils.markdown import ExtendTagClasses
from grandchallenge.core.utils.markdown import ExtendHTMLTagClasses

MEGABYTE = 1024 * 1024
GIGABYTE = 1024 * MEGABYTE
Expand Down Expand Up @@ -790,9 +790,8 @@ def get_private_ip():
"pymdownx.tasklist",
"pymdownx.tilde",
]

MARKDOWN_POST_PROCESSORS = [
ExtendTagClasses(
ExtendHTMLTagClasses(
{
"img": ["img-fluid"],
"blockquote": ["blockquote"],
Expand All @@ -804,7 +803,7 @@ def get_private_ip():
"thead": ["thead-light"],
"code": ["codehilite"],
}
)
),
]
MARKDOWNX_MARKDOWNIFY_FUNCTION = (
"grandchallenge.core.templatetags.bleach.md2html"
Expand Down
51 changes: 37 additions & 14 deletions app/grandchallenge/core/utils/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,30 +5,53 @@
from markdown.treeprocessors import Treeprocessor


class ExtendTagClasses:
class BeautifulSoupWithCharEntities(BeautifulSoup):
"""
Soup generator that elegantly handles reserved HTML entity placeholders.
For instance, the soup HTMLparser converts these (e.g. '&lt;') into their
unicode equivalents (e.g. '<').
This messes up things if the HTML is decoded into a string again.
"""

def __init__(self, /, markup, features="html.parser", **kwargs):
markup = markup.replace("&", "&amp;")

super().__init__(markup=markup, features=features, **kwargs)

def decode(self, **kwargs):
# Prevent entity substitution (e.g. "&" -> "&amp;")
kwargs["formatter"] = None
return super().decode(**kwargs)


class ExtendHTMLTagClasses:
def __init__(self, tag_classes):
self.tag_class_dict = tag_classes
# Make extensions safe
self.tag_class_dict = {
t: [escape(c).strip() for c in classes]
for t, classes in tag_classes.items()
}

def __call__(self, html):
input_is_safe = isinstance(html, SafeString)

soup = BeautifulSoup(html, "html.parser")
for tag, classes in self.tag_class_dict.items():

# Make extensions safe
classes = [escape(c).strip() for c in classes]
soup = BeautifulSoupWithCharEntities(markup=html)

# Add extension to the class attribute
for element in soup.find_all(tag):
current_classes = element.get("class", [])
element["class"] = [*current_classes, *classes]
for element in soup.find_all(self.tag_class_dict.keys()):
classes = element.get("class", [])
for new_class in self.tag_class_dict[element.name]:
if new_class not in classes:
classes.append(new_class)
element["class"] = classes

new_html = str(soup)
new_markup = soup.decode()

if input_is_safe:
new_html = mark_safe(new_html)
new_markup = mark_safe(new_markup)

return mark_safe(new_html)
return new_markup


class LinkBlankTargetExtension(Extension):
Expand Down
214 changes: 141 additions & 73 deletions app/tests/core_tests/test_markdown.py
Original file line number Diff line number Diff line change
@@ -1,61 +1,64 @@
import textwrap

import pytest
from django.conf import settings
from markdown import markdown
from django.utils.safestring import SafeString, mark_safe

from grandchallenge.core.templatetags.bleach import md2html
from grandchallenge.core.utils.markdown import ExtendHTMLTagClasses


@pytest.mark.parametrize(
"markdown_with_html, expected_output",
(
# (
# textwrap.dedent(
# """
# ![](test.png)
# > Quote Me
# Markdown | Less | Pretty
# --- | --- | ---
# *Still* | `renders` | **nicely**
# 1 | 2 | 3
# ```python
# def test_function():
# pass
# ```"""
# ),
# textwrap.dedent(
# """\
# <p><img class="img-fluid" src="test.png"/></p>
# <blockquote class="blockquote">
# <p>Quote Me</p>
# </blockquote>
# <table class="table table-hover table-borderless">
# <thead class="thead-light">
# <tr>
# <th>Markdown</th>
# <th>Less</th>
# <th>Pretty</th>
# </tr>
# </thead>
# <tbody>
# <tr>
# <td><em>Still</em></td>
# <td><code class="codehilite">renders</code></td>
# <td><strong>nicely</strong></td>
# </tr>
# <tr>
# <td>1</td>
# <td>2</td>
# <td>3</td>
# </tr>
# </tbody>
# </table>
# <div class="codehilite"><pre><span></span><span class="k">def</span> <span class="nf">test_function</span><span class="p">():</span>
# <span class="k">pass</span>
# </pre></div>"""
# ),
# ),
(
textwrap.dedent(
"""
![](test.png)
> Quote Me
Markdown | Less | Pretty
--- | --- | ---
*Still* | `renders` | **nicely**
1 | 2 | 3
```python
def test_function():
pass
```"""
),
textwrap.dedent(
"""\
<p><img class="img-fluid" src="test.png"/></p>
<blockquote class="blockquote">
<p>Quote Me</p>
</blockquote>
<table class="table table-hover table-borderless">
<thead class="thead-light">
<tr>
<th>Markdown</th>
<th>Less</th>
<th>Pretty</th>
</tr>
</thead>
<tbody>
<tr>
<td><em>Still</em></td>
<td><code class="codehilite">renders</code></td>
<td><strong>nicely</strong></td>
</tr>
<tr>
<td>1</td>
<td>2</td>
<td>3</td>
</tr>
</tbody>
</table>
<div class="codehilite"><pre><span></span><span class="k">def</span> <span class="nf">test_function</span><span class="p">():</span>
<span class="k">pass</span>
</pre></div>"""
),
),
(
textwrap.dedent(
r"""
Expand Down Expand Up @@ -185,6 +188,10 @@ def test_function():
</ul>"""
),
),
(
"&lt;script&gt;alert(&quot;foo&quot;)&lt;/script&gt;",
"&lt;script&gt;alert(&quot;foo&quot;)&lt;/script&gt;",
),
),
)
def test_markdown_rendering(markdown_with_html, expected_output):
Expand All @@ -196,44 +203,105 @@ def test_markdown_rendering(markdown_with_html, expected_output):
"markdown_with_html, expected_output",
(
(
"""<img src="https://rumc-gcorg-p-public.s3.amazonaws.com/i/2023/10/20/042179f0-ad8c-4c0b-af54-7e81ba389a90.jpeg"/>
[![](http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg)](https://google.com)""",
"""<p><img class="img-fluid" src="https://rumc-gcorg-p-public.s3.amazonaws.com/i/2023/10/20/042179f0-ad8c-4c0b-af54-7e81ba389a90.jpeg"/>
<a href="https://google.com"><img alt="" class="img-fluid" src="http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg" /></a></p>""",
textwrap.dedent(
"""\
<img src="https://rumc-gcorg-p-public.s3.amazonaws.com/i/2023/10/20/042179f0-ad8c-4c0b-af54-7e81ba389a90.jpeg"/>
[![](http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg)](https://google.com)"""
),
textwrap.dedent(
"""\
<p><img class="img-fluid" src="https://rumc-gcorg-p-public.s3.amazonaws.com/i/2023/10/20/042179f0-ad8c-4c0b-af54-7e81ba389a90.jpeg"/>
<a href="https://google.com"><img class="img-fluid" src="http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg"/></a></p>"""
),
),
(
"""<img class="" src="https://rumc-gcorg-p-public.s3.amazonaws.com/i/2023/10/20/042179f0-ad8c-4c0b-af54-7e81ba389a90.jpeg"/>
[![](http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg)](https://google.com)""",
"""<p><img class="img-fluid" src="https://rumc-gcorg-p-public.s3.amazonaws.com/i/2023/10/20/042179f0-ad8c-4c0b-af54-7e81ba389a90.jpeg"/>
<a href="https://google.com"><img alt="" class="img-fluid" src="http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg" /></a></p>""",
textwrap.dedent(
"""\
<img class="" src="https://rumc-gcorg-p-public.s3.amazonaws.com/i/2023/10/20/042179f0-ad8c-4c0b-af54-7e81ba389a90.jpeg"/>
[![](http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg)](https://google.com)"""
),
textwrap.dedent(
"""\
<p><img class="img-fluid" src="https://rumc-gcorg-p-public.s3.amazonaws.com/i/2023/10/20/042179f0-ad8c-4c0b-af54-7e81ba389a90.jpeg"/>
<a href="https://google.com"><img class="img-fluid" src="http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg"/></a></p>"""
),
),
(
"""<img class="ml-2" src="https://rumc-gcorg-p-public.s3.amazonaws.com/i/2023/10/20/042179f0-ad8c-4c0b-af54-7e81ba389a90.jpeg"/>
[![](http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg)](https://google.com)""",
"""<p><img class="ml-2 img-fluid" src="https://rumc-gcorg-p-public.s3.amazonaws.com/i/2023/10/20/042179f0-ad8c-4c0b-af54-7e81ba389a90.jpeg"/>
<a href="https://google.com"><img alt="" class="img-fluid" src="http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg" /></a></p>""",
textwrap.dedent(
"""\
<img class="ml-2" src="https://rumc-gcorg-p-public.s3.amazonaws.com/i/2023/10/20/042179f0-ad8c-4c0b-af54-7e81ba389a90.jpeg"/>
[![](http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg)](https://google.com)"""
),
textwrap.dedent(
"""\
<p><img class="ml-2 img-fluid" src="https://rumc-gcorg-p-public.s3.amazonaws.com/i/2023/10/20/042179f0-ad8c-4c0b-af54-7e81ba389a90.jpeg"/>
<a href="https://google.com"><img class="img-fluid" src="http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg"/></a></p>"""
),
),
(
"""<img class="img-fluid" src="https://rumc-gcorg-p-public.s3.amazonaws.com/i/2023/10/20/042179f0-ad8c-4c0b-af54-7e81ba389a90.jpeg"/>
[![](http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg)](https://google.com)""",
"""<p><img class="img-fluid" src="https://rumc-gcorg-p-public.s3.amazonaws.com/i/2023/10/20/042179f0-ad8c-4c0b-af54-7e81ba389a90.jpeg"/>
<a href="https://google.com"><img alt="" class="img-fluid" src="http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg" /></a></p>""",
textwrap.dedent(
"""\
<img class="img-fluid" src="https://rumc-gcorg-p-public.s3.amazonaws.com/i/2023/10/20/042179f0-ad8c-4c0b-af54-7e81ba389a90.jpeg"/>
[![](http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg)](https://google.com)"""
),
textwrap.dedent(
"""\
<p><img class="img-fluid" src="https://rumc-gcorg-p-public.s3.amazonaws.com/i/2023/10/20/042179f0-ad8c-4c0b-af54-7e81ba389a90.jpeg"/>
<a href="https://google.com"><img class="img-fluid" src="http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg"/></a></p>"""
),
),
(
"""<img class="ml-2 img-fluid" src="https://rumc-gcorg-p-public.s3.amazonaws.com/i/2023/10/20/042179f0-ad8c-4c0b-af54-7e81ba389a90.jpeg"/>
[![](http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg)](https://google.com)""",
"""<p><img class="ml-2 img-fluid" src="https://rumc-gcorg-p-public.s3.amazonaws.com/i/2023/10/20/042179f0-ad8c-4c0b-af54-7e81ba389a90.jpeg"/>
<a href="https://google.com"><img alt="" class="img-fluid" src="http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg" /></a></p>""",
textwrap.dedent(
"""\
<img class="ml-2 img-fluid" src="https://rumc-gcorg-p-public.s3.amazonaws.com/i/2023/10/20/042179f0-ad8c-4c0b-af54-7e81ba389a90.jpeg"/>
[![](http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg)](https://google.com)"""
),
textwrap.dedent(
"""\
<p><img class="ml-2 img-fluid" src="https://rumc-gcorg-p-public.s3.amazonaws.com/i/2023/10/20/042179f0-ad8c-4c0b-af54-7e81ba389a90.jpeg"/>
<a href="https://google.com"><img class="img-fluid" src="http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg"/></a></p>"""
),
),
),
)
def test_setting_class_to_html_img_within_markdown(
markdown_with_html, expected_output
):
output = markdown(
text=markdown_with_html,
extensions=settings.MARKDOWNX_MARKDOWN_EXTENSIONS,
extension_configs=settings.MARKDOWNX_MARKDOWN_EXTENSION_CONFIGS,
)
output = md2html(markdown=markdown_with_html)

assert output == expected_output


@pytest.mark.parametrize(
"html, is_safe",
[
(
mark_safe("<div>Content</div>"),
True,
),
(
"<div>Content</div>",
False,
),
],
)
def test_extend_html_tag_classes_insecure_markup(html, is_safe):
tag_classes = {"div": ["new-class"]}

# Instantiate the class
extender = ExtendHTMLTagClasses(tag_classes)

# Process the HTML
result = extender(html)

# Check if the output matches the expected safety status
assert isinstance(result, SafeString) == is_safe


def test_extend_html_tag_classes_insecure_classes():
extender = ExtendHTMLTagClasses({"div": ['<script>alert("foo")</script>']})
output = extender("<div>Content</div>")
assert (
output
== '<div class="&lt;script&gt;alert(&quot;foo&quot;)&lt;/script&gt;">Content</div>'
)

0 comments on commit 42d41dc

Please sign in to comment.