From 2cf765ea583af515802da2dc6127ed40c0703741 Mon Sep 17 00:00:00 2001 From: Chris van Run Date: Wed, 18 Dec 2024 16:54:38 +0100 Subject: [PATCH 01/12] Ensure optional postprocessors are ran before the default postprocessors --- app/config/settings.py | 26 ++++++++++++++++--- .../core/templatetags/bleach.py | 2 +- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/app/config/settings.py b/app/config/settings.py index 8e06ba64a0..e6621d52e4 100644 --- a/app/config/settings.py +++ b/app/config/settings.py @@ -22,7 +22,7 @@ from config.denylist import USERNAME_DENYLIST from grandchallenge.components.exceptions import PriorStepFailed from grandchallenge.core.utils import strtobool -from grandchallenge.core.utils.markdown import BS4Extension +from grandchallenge.core.utils.markdown import ExtendTagClasses MEGABYTE = 1024 * 1024 GIGABYTE = 1024 * MEGABYTE @@ -789,13 +789,31 @@ def get_private_ip(): "pymdownx.magiclink", "pymdownx.tasklist", "pymdownx.tilde", - BS4Extension(), ] -MARKDOWN_POST_PROCESSORS = [] + +MARKDOWN_POST_PROCESSORS = [ + ExtendTagClasses( + { + "img": ["img-fluid"], + "blockquote": ["blockquote"], + "table": [ + "table", + "table-hover", + "table-borderless", + ], + "thead": ["thead-light"], + "code": ["codehilite"], + } + ) +] MARKDOWNX_MARKDOWNIFY_FUNCTION = ( "grandchallenge.core.templatetags.bleach.md2html" ) -MARKDOWNX_MARKDOWN_EXTENSION_CONFIGS = {} +MARKDOWNX_MARKDOWN_EXTENSION_CONFIGS = { + "markdown.extensions.codehilite": { + "wrapcode": False, + } +} MARKDOWNX_IMAGE_MAX_SIZE = {"size": (2000, 0), "quality": 90} MARKDOWNX_EDITOR_RESIZABLE = "False" diff --git a/app/grandchallenge/core/templatetags/bleach.py b/app/grandchallenge/core/templatetags/bleach.py index a0c9386e14..2bf85bfc4e 100644 --- a/app/grandchallenge/core/templatetags/bleach.py +++ b/app/grandchallenge/core/templatetags/bleach.py @@ -87,7 +87,7 @@ def md2html( post_processors = [*settings.MARKDOWN_POST_PROCESSORS] if process_youtube_tags: - post_processors.append(YOUTUBE_TAG_SUBSTITUTION) + post_processors = [YOUTUBE_TAG_SUBSTITUTION, *post_processors] for processor in post_processors: cleaned_html = processor(cleaned_html) From b51031250e947595748ab23be6f8e9a2feacb40f Mon Sep 17 00:00:00 2001 From: Chris van Run Date: Wed, 18 Dec 2024 16:55:13 +0100 Subject: [PATCH 02/12] Add new ExtendTagClasses post processor --- app/grandchallenge/core/utils/markdown.py | 67 +++++++---------------- 1 file changed, 20 insertions(+), 47 deletions(-) diff --git a/app/grandchallenge/core/utils/markdown.py b/app/grandchallenge/core/utils/markdown.py index 98b9d94176..c963831486 100644 --- a/app/grandchallenge/core/utils/markdown.py +++ b/app/grandchallenge/core/utils/markdown.py @@ -1,61 +1,34 @@ -from xml.etree import ElementTree - -from bs4 import BeautifulSoup, Tag +from bs4 import BeautifulSoup +from django.utils.html import escape +from django.utils.safestring import SafeString, mark_safe from markdown import Extension from markdown.treeprocessors import Treeprocessor -class BS4Extension(Extension): - def extendMarkdown(self, md): # noqa: N802 - md.registerExtension(self) - md.treeprocessors.register(BS4Treeprocessor(md), "bs4_extension", 0) - - -class BS4Treeprocessor(Treeprocessor): - def run(self, root): - el_class_dict = { - "img": "img-fluid", - "blockquote": "blockquote", - "table": "table table-hover table-borderless", - "thead": "thead-light", - "code": "codehilite", - } - - for el in root.iter(): - if el.tag in el_class_dict: - self.set_css_class( - element=el, class_name=el_class_dict[el.tag] - ) - - for i, html_block in enumerate(self.md.htmlStash.rawHtmlBlocks): - bs4block = BeautifulSoup(html_block, "html.parser") +class ExtendTagClasses: + def __init__(self, tag_classes): + self.tag_class_dict = tag_classes - for tag, tag_class in el_class_dict.items(): - for el in bs4block.find_all(tag): - self.set_css_class(element=el, class_name=tag_class) - self.md.htmlStash.rawHtmlBlocks[i] = str(bs4block) + def __call__(self, html): + input_is_safe = isinstance(html, SafeString) - @staticmethod - def set_css_class(*, element, class_name): - if isinstance(element, ElementTree.Element): - current_class = element.attrib.get("class", "") + soup = BeautifulSoup(html, "html.parser") + for tag, classes in self.tag_class_dict.items(): - if class_name not in current_class: - new_class = f"{current_class} {class_name}".strip() - element.set("class", new_class) + # Make extensions safe + classes = [escape(c).strip() for c in classes] - elif isinstance(element, Tag): - if "class" not in element.attrs: - element.attrs["class"] = [] + # Add extension to the class attribute + for element in soup.find_all(tag): + current_classes = element.get("class", []) + element["class"] = [*current_classes, *classes] - current_class = element["class"] + new_html = str(soup) - for name in class_name.split(" "): - if class_name not in current_class: - current_class.append(name) + if input_is_safe: + new_html = mark_safe(new_html) - else: - raise TypeError("Unsupported element") + return mark_safe(new_html) class LinkBlankTargetExtension(Extension): From 72d340203f4118cc2da3651be42e73ab1506a2e2 Mon Sep 17 00:00:00 2001 From: Chris van Run Date: Wed, 18 Dec 2024 16:55:23 +0100 Subject: [PATCH 03/12] First fixes to markdown tests --- app/tests/core_tests/test_markdown.py | 120 +++++++++++--------------- 1 file changed, 52 insertions(+), 68 deletions(-) diff --git a/app/tests/core_tests/test_markdown.py b/app/tests/core_tests/test_markdown.py index 3dcf485759..460ddef207 100644 --- a/app/tests/core_tests/test_markdown.py +++ b/app/tests/core_tests/test_markdown.py @@ -5,61 +5,57 @@ from markdown import markdown from grandchallenge.core.templatetags.bleach import md2html -from grandchallenge.core.utils.markdown import BS4Treeprocessor @pytest.mark.parametrize( "markdown_with_html, expected_output", ( - ( - textwrap.dedent( - """ - ![](test.png) - - > Quote Me - - Markdown | Less | Pretty - --- | --- | --- - *Still* | `renders` | **nicely** - 1 | 2 | 3 - - ```python - def test_function(): - pass - ```""" - ), - textwrap.dedent( - """\ -

-
-

Quote Me

-
- - - - - - - - - - - - - - - - - - - - -
MarkdownLessPretty
Stillrendersnicely
123
-
def test_function():
-                    pass
-                
""" - ), - ), + # ( + # textwrap.dedent( + # """ + # ![](test.png) + # > Quote Me + # Markdown | Less | Pretty + # --- | --- | --- + # *Still* | `renders` | **nicely** + # 1 | 2 | 3 + # ```python + # def test_function(): + # pass + # ```""" + # ), + # textwrap.dedent( + # """\ + #

+ #
+ #

Quote Me

+ #
+ # + # + # + # + # + # + # + # + # + # + # + # + # + # + # + # + # + # + # + # + #
MarkdownLessPretty
Stillrendersnicely
123
+ #
def test_function():
+        #             pass
+        #         
""" + # ), + # ), ( textwrap.dedent( r""" @@ -123,10 +119,10 @@ def test_function(): ), textwrap.dedent( """\ -

-

-

-

+

+

+

+

Quote Me

@@ -136,7 +132,6 @@ def test_function():

Quote Me Existing Class

- @@ -167,7 +162,6 @@ def test_function():
- @@ -177,14 +171,11 @@ def test_function():
- -
def test_function():
+                
def test_function():
                     pass
-                
- +
no class
existing class
-

Delete me

CH3CH2OH texta subscript

@@ -246,10 +237,3 @@ def test_setting_class_to_html_img_within_markdown( ) assert output == expected_output - - -def test_tree_processor_set_css_class_type_error(): - with pytest.raises(TypeError): - BS4Treeprocessor.set_css_class( - element="element", class_name="img-fluid" - ) From 42d41dcba3a24e2f0731ea1a421798922b5626cd Mon Sep 17 00:00:00 2001 From: Chris van Run Date: Fri, 20 Dec 2024 15:47:07 +0100 Subject: [PATCH 04/12] Add HTML postprocessor to extend classes --- app/config/settings.py | 7 +- app/grandchallenge/core/utils/markdown.py | 51 ++++-- app/tests/core_tests/test_markdown.py | 214 ++++++++++++++-------- 3 files changed, 181 insertions(+), 91 deletions(-) diff --git a/app/config/settings.py b/app/config/settings.py index e6621d52e4..7303c51c38 100644 --- a/app/config/settings.py +++ b/app/config/settings.py @@ -22,7 +22,7 @@ from config.denylist import USERNAME_DENYLIST from grandchallenge.components.exceptions import PriorStepFailed from grandchallenge.core.utils import strtobool -from grandchallenge.core.utils.markdown import ExtendTagClasses +from grandchallenge.core.utils.markdown import ExtendHTMLTagClasses MEGABYTE = 1024 * 1024 GIGABYTE = 1024 * MEGABYTE @@ -790,9 +790,8 @@ def get_private_ip(): "pymdownx.tasklist", "pymdownx.tilde", ] - MARKDOWN_POST_PROCESSORS = [ - ExtendTagClasses( + ExtendHTMLTagClasses( { "img": ["img-fluid"], "blockquote": ["blockquote"], @@ -804,7 +803,7 @@ def get_private_ip(): "thead": ["thead-light"], "code": ["codehilite"], } - ) + ), ] MARKDOWNX_MARKDOWNIFY_FUNCTION = ( "grandchallenge.core.templatetags.bleach.md2html" diff --git a/app/grandchallenge/core/utils/markdown.py b/app/grandchallenge/core/utils/markdown.py index c963831486..4427529ef1 100644 --- a/app/grandchallenge/core/utils/markdown.py +++ b/app/grandchallenge/core/utils/markdown.py @@ -5,30 +5,53 @@ from markdown.treeprocessors import Treeprocessor -class ExtendTagClasses: +class BeautifulSoupWithCharEntities(BeautifulSoup): + """ + Soup generator that elegantly handles reserved HTML entity placeholders. + + For instance, the soup HTMLparser replaces these (e.g. '<') into their + unicode equivalents (e.g. '<'). + + This messes up things if the HTML is decoded into a string again. + """ + + def __init__(self, /, markup, features="html.parser", **kwargs): + markup = markup.replace("&", "&") + + super().__init__(markup=markup, features=features, **kwargs) + + def decode(self, **kwargs): + # Prevent entity subsitution (e.g. "&" -> "&") + kwargs["formatter"] = None + return super().decode(**kwargs) + + +class ExtendHTMLTagClasses: def __init__(self, tag_classes): - self.tag_class_dict = tag_classes + # Make extensions safe + self.tag_class_dict = { + t: [escape(c).strip() for c in classes] + for t, classes in tag_classes.items() + } def __call__(self, html): input_is_safe = isinstance(html, SafeString) - soup = BeautifulSoup(html, "html.parser") - for tag, classes in self.tag_class_dict.items(): - - # Make extensions safe - classes = [escape(c).strip() for c in classes] + soup = BeautifulSoupWithCharEntities(markup=html) - # Add extension to the class attribute - for element in soup.find_all(tag): - current_classes = element.get("class", []) - element["class"] = [*current_classes, *classes] + for element in soup.find_all(self.tag_class_dict.keys()): + classes = element.get("class", []) + for new_class in self.tag_class_dict[element.name]: + if new_class not in classes: + classes.append(new_class) + element["class"] = classes - new_html = str(soup) + new_markup = soup.decode() if input_is_safe: - new_html = mark_safe(new_html) + new_markup = mark_safe(new_markup) - return mark_safe(new_html) + return new_markup class LinkBlankTargetExtension(Extension): diff --git a/app/tests/core_tests/test_markdown.py b/app/tests/core_tests/test_markdown.py index 460ddef207..2dedc5df6b 100644 --- a/app/tests/core_tests/test_markdown.py +++ b/app/tests/core_tests/test_markdown.py @@ -1,61 +1,64 @@ import textwrap import pytest -from django.conf import settings -from markdown import markdown +from django.utils.safestring import SafeString, mark_safe from grandchallenge.core.templatetags.bleach import md2html +from grandchallenge.core.utils.markdown import ExtendHTMLTagClasses @pytest.mark.parametrize( "markdown_with_html, expected_output", ( - # ( - # textwrap.dedent( - # """ - # ![](test.png) - # > Quote Me - # Markdown | Less | Pretty - # --- | --- | --- - # *Still* | `renders` | **nicely** - # 1 | 2 | 3 - # ```python - # def test_function(): - # pass - # ```""" - # ), - # textwrap.dedent( - # """\ - #

- #
- #

Quote Me

- #
- # - # - # - # - # - # - # - # - # - # - # - # - # - # - # - # - # - # - # - # - #
MarkdownLessPretty
Stillrendersnicely
123
- #
def test_function():
-        #             pass
-        #         
""" - # ), - # ), + ( + textwrap.dedent( + """ + ![](test.png) + + > Quote Me + + Markdown | Less | Pretty + --- | --- | --- + *Still* | `renders` | **nicely** + 1 | 2 | 3 + + ```python + def test_function(): + pass + ```""" + ), + textwrap.dedent( + """\ +

+
+

Quote Me

+
+ + + + + + + + + + + + + + + + + + + + +
MarkdownLessPretty
Stillrendersnicely
123
+
def test_function():
+                    pass
+                
""" + ), + ), ( textwrap.dedent( r""" @@ -185,6 +188,10 @@ def test_function(): """ ), ), + ( + "<script>alert("foo")</script>", + "<script>alert("foo")</script>", + ), ), ) def test_markdown_rendering(markdown_with_html, expected_output): @@ -196,44 +203,105 @@ def test_markdown_rendering(markdown_with_html, expected_output): "markdown_with_html, expected_output", ( ( - """ - [![](http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg)](https://google.com)""", - """

-

""", + textwrap.dedent( + """\ + + [![](http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg)](https://google.com)""" + ), + textwrap.dedent( + """\ +

+

""" + ), ), ( - """ - [![](http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg)](https://google.com)""", - """

-

""", + textwrap.dedent( + """\ + + [![](http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg)](https://google.com)""" + ), + textwrap.dedent( + """\ +

+

""" + ), ), ( - """ - [![](http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg)](https://google.com)""", - """

-

""", + textwrap.dedent( + """\ + + [![](http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg)](https://google.com)""" + ), + textwrap.dedent( + """\ +

+

""" + ), ), ( - """ - [![](http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg)](https://google.com)""", - """

-

""", + textwrap.dedent( + """\ + + [![](http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg)](https://google.com)""" + ), + textwrap.dedent( + """\ +

+

""" + ), ), ( - """ - [![](http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg)](https://google.com)""", - """

-

""", + textwrap.dedent( + """\ + + [![](http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg)](https://google.com)""" + ), + textwrap.dedent( + """\ +

+

""" + ), ), ), ) def test_setting_class_to_html_img_within_markdown( markdown_with_html, expected_output ): - output = markdown( - text=markdown_with_html, - extensions=settings.MARKDOWNX_MARKDOWN_EXTENSIONS, - extension_configs=settings.MARKDOWNX_MARKDOWN_EXTENSION_CONFIGS, - ) + output = md2html(markdown=markdown_with_html) assert output == expected_output + + +@pytest.mark.parametrize( + "html, is_safe", + [ + ( + mark_safe("
Content
"), + True, + ), + ( + "
Content
", + False, + ), + ], +) +def test_extend_html_tag_classes_insecure_markup(html, is_safe): + tag_classes = {"div": ["new-class"]} + + # Instantiate the class + extender = ExtendHTMLTagClasses(tag_classes) + + # Process the HTML + result = extender(html) + + # Check if the output matches the expected safety status + assert isinstance(result, SafeString) == is_safe + + +def test_extend_html_tag_classes_insecure_classes(): + extender = ExtendHTMLTagClasses({"div": ['']}) + output = extender("
Content
") + assert ( + output + == '
Content
' + ) From 237838495a78120be3a64f1bc56026c8f748fa17 Mon Sep 17 00:00:00 2001 From: Chris van Run Date: Fri, 20 Dec 2024 16:24:20 +0100 Subject: [PATCH 05/12] Refactor test --- app/tests/core_tests/test_markdown.py | 137 +++++++++----------------- 1 file changed, 45 insertions(+), 92 deletions(-) diff --git a/app/tests/core_tests/test_markdown.py b/app/tests/core_tests/test_markdown.py index 2dedc5df6b..75e887f1c8 100644 --- a/app/tests/core_tests/test_markdown.py +++ b/app/tests/core_tests/test_markdown.py @@ -190,118 +190,71 @@ def test_function(): ), ( "<script>alert("foo")</script>", - "<script>alert("foo")</script>", - ), - ), -) -def test_markdown_rendering(markdown_with_html, expected_output): - output = md2html(markdown=markdown_with_html) - assert output == expected_output - - -@pytest.mark.parametrize( - "markdown_with_html, expected_output", - ( - ( - textwrap.dedent( - """\ - - [![](http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg)](https://google.com)""" - ), - textwrap.dedent( - """\ -

-

""" - ), - ), - ( - textwrap.dedent( - """\ - - [![](http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg)](https://google.com)""" - ), - textwrap.dedent( - """\ -

-

""" - ), - ), - ( - textwrap.dedent( - """\ - - [![](http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg)](https://google.com)""" - ), - textwrap.dedent( - """\ -

-

""" - ), + "

<script>alert("foo")</script>

", ), ( - textwrap.dedent( - """\ - - [![](http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg)](https://google.com)""" - ), - textwrap.dedent( - """\ -

-

""" - ), - ), - ( - textwrap.dedent( - """\ - - [![](http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg)](https://google.com)""" - ), - textwrap.dedent( - """\ -

-

""" - ), + "[![](http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg)](https://google.com)", + '

', ), ), ) -def test_setting_class_to_html_img_within_markdown( - markdown_with_html, expected_output -): +def test_markdown_rendering(markdown_with_html, expected_output): output = md2html(markdown=markdown_with_html) - assert output == expected_output @pytest.mark.parametrize( - "html, is_safe", + "html, tag_classes, expected_output, is_safe", [ - ( + ( # Safe input mark_safe("
Content
"), + {}, + "
Content
", True, ), - ( + ( # Unsafe input + "
Content
", + {}, "
Content
", False, ), + ( # Escaped classes + mark_safe("
Content
"), + {"div": ['']}, + '
Content
', + True, + ), + ( # Empty class + '
Content
', + {"div": ["foo"]}, + '
Content
', + False, + ), + ( # Existing class + '
Content
', + {"div": ["foo"]}, + '
Content
', + False, + ), + ( # Extension class already present + '
Content
', + {"div": ["foo"]}, + '
Content
', + False, + ), + ( # Existing class + extension class + '
Content
', + {"div": ["foo"]}, + '
Content
', + False, + ), ], ) -def test_extend_html_tag_classes_insecure_markup(html, is_safe): - tag_classes = {"div": ["new-class"]} - - # Instantiate the class +def test_extend_html_tag_classes(html, tag_classes, expected_output, is_safe): extender = ExtendHTMLTagClasses(tag_classes) + output = extender(html) - # Process the HTML - result = extender(html) + assert output == expected_output # Check if the output matches the expected safety status - assert isinstance(result, SafeString) == is_safe - - -def test_extend_html_tag_classes_insecure_classes(): - extender = ExtendHTMLTagClasses({"div": ['']}) - output = extender("
Content
") - assert ( - output - == '
Content
' - ) + assert isinstance(output, SafeString) == is_safe From 5210a63e0007861957cddf1d8c546d3bd70018e9 Mon Sep 17 00:00:00 2001 From: Chris van Run Date: Fri, 20 Dec 2024 16:24:30 +0100 Subject: [PATCH 06/12] Handle empty tags --- app/grandchallenge/core/utils/markdown.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/app/grandchallenge/core/utils/markdown.py b/app/grandchallenge/core/utils/markdown.py index 4427529ef1..7f820de2ea 100644 --- a/app/grandchallenge/core/utils/markdown.py +++ b/app/grandchallenge/core/utils/markdown.py @@ -29,7 +29,7 @@ def decode(self, **kwargs): class ExtendHTMLTagClasses: def __init__(self, tag_classes): # Make extensions safe - self.tag_class_dict = { + self.clean_tag_classes = { t: [escape(c).strip() for c in classes] for t, classes in tag_classes.items() } @@ -39,12 +39,14 @@ def __call__(self, html): soup = BeautifulSoupWithCharEntities(markup=html) - for element in soup.find_all(self.tag_class_dict.keys()): - classes = element.get("class", []) - for new_class in self.tag_class_dict[element.name]: - if new_class not in classes: - classes.append(new_class) - element["class"] = classes + tags = [*self.clean_tag_classes.keys()] + if tags: + for element in soup.find_all(tags): + classes = element.get("class", []) + for new_class in self.clean_tag_classes[element.name]: + if new_class not in classes: + classes.append(new_class) + element["class"] = classes new_markup = soup.decode() From edae00c7aa6d405bdabb25ddb326c843bf84e4cd Mon Sep 17 00:00:00 2001 From: Chris van Run Date: Fri, 20 Dec 2024 16:46:46 +0100 Subject: [PATCH 07/12] Fix youtube tag subsitution test --- .../core_tests/test_tag_substitutions.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/app/tests/core_tests/test_tag_substitutions.py b/app/tests/core_tests/test_tag_substitutions.py index 9d76d73287..3ae6208ee9 100644 --- a/app/tests/core_tests/test_tag_substitutions.py +++ b/app/tests/core_tests/test_tag_substitutions.py @@ -1,3 +1,5 @@ +import textwrap + import pytest from django.utils.safestring import SafeString, mark_safe @@ -131,16 +133,13 @@ def test_no_change_with_no_tag_or_arg_match(content): assert s == content -EXPECTED_YOUTUBE_EMBED = """

- -
-

""" +EXPECTED_YOUTUBE_EMBED = textwrap.dedent( + """\ +

+ +
+

""" +) @pytest.mark.parametrize( From f7286a9703ab3c8106003c33271d13da1fb2720d Mon Sep 17 00:00:00 2001 From: James Meakin <12661555+jmsmkn@users.noreply.github.com> Date: Wed, 8 Jan 2025 11:27:43 +0100 Subject: [PATCH 08/12] Revert to using markdown processor --- app/config/settings.py | 19 +---- .../core/templatetags/bleach.py | 2 +- app/grandchallenge/core/utils/markdown.py | 65 +++++----------- app/tests/core_tests/test_markdown.py | 75 +++++++------------ 4 files changed, 52 insertions(+), 109 deletions(-) diff --git a/app/config/settings.py b/app/config/settings.py index 7303c51c38..7310eeba90 100644 --- a/app/config/settings.py +++ b/app/config/settings.py @@ -22,7 +22,7 @@ from config.denylist import USERNAME_DENYLIST from grandchallenge.components.exceptions import PriorStepFailed from grandchallenge.core.utils import strtobool -from grandchallenge.core.utils.markdown import ExtendHTMLTagClasses +from grandchallenge.core.utils.markdown import BS4Extension MEGABYTE = 1024 * 1024 GIGABYTE = 1024 * MEGABYTE @@ -789,22 +789,9 @@ def get_private_ip(): "pymdownx.magiclink", "pymdownx.tasklist", "pymdownx.tilde", + BS4Extension(), ] -MARKDOWN_POST_PROCESSORS = [ - ExtendHTMLTagClasses( - { - "img": ["img-fluid"], - "blockquote": ["blockquote"], - "table": [ - "table", - "table-hover", - "table-borderless", - ], - "thead": ["thead-light"], - "code": ["codehilite"], - } - ), -] +MARKDOWN_POST_PROCESSORS = [] MARKDOWNX_MARKDOWNIFY_FUNCTION = ( "grandchallenge.core.templatetags.bleach.md2html" ) diff --git a/app/grandchallenge/core/templatetags/bleach.py b/app/grandchallenge/core/templatetags/bleach.py index 2bf85bfc4e..a0c9386e14 100644 --- a/app/grandchallenge/core/templatetags/bleach.py +++ b/app/grandchallenge/core/templatetags/bleach.py @@ -87,7 +87,7 @@ def md2html( post_processors = [*settings.MARKDOWN_POST_PROCESSORS] if process_youtube_tags: - post_processors = [YOUTUBE_TAG_SUBSTITUTION, *post_processors] + post_processors.append(YOUTUBE_TAG_SUBSTITUTION) for processor in post_processors: cleaned_html = processor(cleaned_html) diff --git a/app/grandchallenge/core/utils/markdown.py b/app/grandchallenge/core/utils/markdown.py index 7f820de2ea..28df5d6c11 100644 --- a/app/grandchallenge/core/utils/markdown.py +++ b/app/grandchallenge/core/utils/markdown.py @@ -1,59 +1,34 @@ from bs4 import BeautifulSoup -from django.utils.html import escape -from django.utils.safestring import SafeString, mark_safe from markdown import Extension +from markdown.postprocessors import Postprocessor from markdown.treeprocessors import Treeprocessor -class BeautifulSoupWithCharEntities(BeautifulSoup): - """ - Soup generator that elegantly handles reserved HTML entity placeholders. - - For instance, the soup HTMLparser replaces these (e.g. '<') into their - unicode equivalents (e.g. '<'). - - This messes up things if the HTML is decoded into a string again. - """ - - def __init__(self, /, markup, features="html.parser", **kwargs): - markup = markup.replace("&", "&") - - super().__init__(markup=markup, features=features, **kwargs) +class BS4Extension(Extension): + def extendMarkdown(self, md): # noqa: N802 + md.postprocessors.register(BS4Postprocessor(md), "bs4_extension", 0) - def decode(self, **kwargs): - # Prevent entity subsitution (e.g. "&" -> "&") - kwargs["formatter"] = None - return super().decode(**kwargs) +class BS4Postprocessor(Postprocessor): + def run(self, text): + soup = BeautifulSoup(text, "html.parser") -class ExtendHTMLTagClasses: - def __init__(self, tag_classes): - # Make extensions safe - self.clean_tag_classes = { - t: [escape(c).strip() for c in classes] - for t, classes in tag_classes.items() + class_map = { + "img": ["img-fluid"], + "blockquote": ["blockquote"], + "table": ["table", "table-hover", "table-borderless"], + "thead": ["thead-light"], + "code": ["codehilite"], } - def __call__(self, html): - input_is_safe = isinstance(html, SafeString) - - soup = BeautifulSoupWithCharEntities(markup=html) - - tags = [*self.clean_tag_classes.keys()] - if tags: - for element in soup.find_all(tags): - classes = element.get("class", []) - for new_class in self.clean_tag_classes[element.name]: - if new_class not in classes: - classes.append(new_class) - element["class"] = classes - - new_markup = soup.decode() - - if input_is_safe: - new_markup = mark_safe(new_markup) + for element in soup.find_all([*class_map.keys()]): + classes = element.get("class", []) + for new_class in class_map[element.name]: + if new_class not in classes: + classes.append(new_class) + element["class"] = classes - return new_markup + return str(soup) class LinkBlankTargetExtension(Extension): diff --git a/app/tests/core_tests/test_markdown.py b/app/tests/core_tests/test_markdown.py index 91025fbde1..40e3bbc238 100644 --- a/app/tests/core_tests/test_markdown.py +++ b/app/tests/core_tests/test_markdown.py @@ -1,10 +1,9 @@ import textwrap import pytest -from django.utils.safestring import SafeString, mark_safe +from markdown import markdown from grandchallenge.core.templatetags.bleach import md2html -from grandchallenge.core.utils.markdown import ExtendHTMLTagClasses @pytest.mark.parametrize( @@ -29,7 +28,7 @@ def test_function(): ), textwrap.dedent( """\ -

+

Quote Me

@@ -122,10 +121,10 @@ def test_function(): ), textwrap.dedent( """\ -

-

-

-

+

+

+

+

Quote Me

@@ -184,17 +183,17 @@ def test_function(): texta subscript

""" ), ), ( "<script>alert("foo")</script>", - "

<script>alert("foo")</script>

", + '

<script>alert("foo")</script>

', ), ( "[![](http://minio.localhost:9000/grand-challenge-public/i/2024/08/06/77c8d999-c22b-4983-8558-8e1fa364cd2c.jpg)](https://google.com)", - '

', + '

', ), ), ) @@ -204,57 +203,39 @@ def test_markdown_rendering(markdown_with_html, expected_output): @pytest.mark.parametrize( - "html, tag_classes, expected_output, is_safe", + "html, expected_output", [ - ( # Safe input - mark_safe("
Content
"), - {}, + ( # Unaffected element "
Content
", - True, - ), - ( # Unsafe input - "
Content
", - {}, "
Content
", - False, ), - ( # Escaped classes - mark_safe("
Content
"), - {"div": ['']}, - '
Content
', - True, + ( # With Markdown + "> Content", + '
\n

Content

\n
', ), ( # Empty class - '
Content
', - {"div": ["foo"]}, - '
Content
', - False, + '
Content
', + '
Content
', ), ( # Existing class - '
Content
', - {"div": ["foo"]}, - '
Content
', - False, + '
Content
', + '
Content
', ), ( # Extension class already present - '
Content
', - {"div": ["foo"]}, - '
Content
', - False, + '
Content
', + '
Content
', ), ( # Existing class + extension class - '
Content
', - {"div": ["foo"]}, - '
Content
', - False, + '
Content
', + '
Content
', ), ], ) -def test_extend_html_tag_classes(html, tag_classes, expected_output, is_safe): - extender = ExtendHTMLTagClasses(tag_classes) - output = extender(html) +def test_extend_html_tag_classes(html, expected_output, settings): + output = markdown( + text=html, + extensions=settings.MARKDOWNX_MARKDOWN_EXTENSIONS, + extension_configs=settings.MARKDOWNX_MARKDOWN_EXTENSION_CONFIGS, + ) assert output == expected_output - - # Check if the output matches the expected safety status - assert isinstance(output, SafeString) == is_safe From a08d295c1a37123053204321060b213b51856a49 Mon Sep 17 00:00:00 2001 From: James Meakin <12661555+jmsmkn@users.noreply.github.com> Date: Wed, 8 Jan 2025 11:35:36 +0100 Subject: [PATCH 09/12] Revert expected text --- .../core_tests/test_tag_substitutions.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/app/tests/core_tests/test_tag_substitutions.py b/app/tests/core_tests/test_tag_substitutions.py index 3ae6208ee9..9d76d73287 100644 --- a/app/tests/core_tests/test_tag_substitutions.py +++ b/app/tests/core_tests/test_tag_substitutions.py @@ -1,5 +1,3 @@ -import textwrap - import pytest from django.utils.safestring import SafeString, mark_safe @@ -133,13 +131,16 @@ def test_no_change_with_no_tag_or_arg_match(content): assert s == content -EXPECTED_YOUTUBE_EMBED = textwrap.dedent( - """\ -

- -
-

""" -) +EXPECTED_YOUTUBE_EMBED = """

+ +
+

""" @pytest.mark.parametrize( From 910c14594be5dc25fb345658b4bc20bb2fc2a08b Mon Sep 17 00:00:00 2001 From: James Meakin <12661555+jmsmkn@users.noreply.github.com> Date: Wed, 8 Jan 2025 11:41:55 +0100 Subject: [PATCH 10/12] Add a test for mixed content --- app/tests/core_tests/test_markdown.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/app/tests/core_tests/test_markdown.py b/app/tests/core_tests/test_markdown.py index 40e3bbc238..c04275ba09 100644 --- a/app/tests/core_tests/test_markdown.py +++ b/app/tests/core_tests/test_markdown.py @@ -213,6 +213,11 @@ def test_markdown_rendering(markdown_with_html, expected_output): "> Content", '
\n

Content

\n
', ), + ( # Mixed content + "> Markdown Content\n" + "
HTML Content
", + '
\n

Markdown Content

\n
\n
HTML Content
', + ), ( # Empty class '
Content
', '
Content
', From 1f6a64b791b50d6a2192a70121ef36d34bf3b6a6 Mon Sep 17 00:00:00 2001 From: James Meakin <12661555+jmsmkn@users.noreply.github.com> Date: Wed, 8 Jan 2025 11:52:34 +0100 Subject: [PATCH 11/12] Add spacing to code blocks --- app/grandchallenge/core/static/css/base.scss | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/app/grandchallenge/core/static/css/base.scss b/app/grandchallenge/core/static/css/base.scss index f9a8e6bd46..c8fe073f46 100644 --- a/app/grandchallenge/core/static/css/base.scss +++ b/app/grandchallenge/core/static/css/base.scss @@ -50,3 +50,7 @@ blockquote { border-left: $spacer * .25 solid $primary; color: $primary; } + +div.codehilite { + margin-bottom: $paragraph-margin-bottom; +} From 3ea67df0da46755d3f405ef2c24e366e7a3bb18f Mon Sep 17 00:00:00 2001 From: James Meakin <12661555+jmsmkn@users.noreply.github.com> Date: Wed, 8 Jan 2025 12:53:22 +0100 Subject: [PATCH 12/12] Fix test --- app/tests/pages_tests/test_pages.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/app/tests/pages_tests/test_pages.py b/app/tests/pages_tests/test_pages.py index 7b390ae788..09931fdae7 100644 --- a/app/tests/pages_tests/test_pages.py +++ b/app/tests/pages_tests/test_pages.py @@ -107,8 +107,8 @@ def test_page_create(client, two_challenge_sets): response = get_view_for_user(url=response.url, client=client) assert response.status_code == 200 assert ( - '

HELLO WORLD

' - in str(response.content) + '

HELLO WORLDΒΆ

' + in response.content.decode("utf-8") ) # Check that it was created in the correct challenge response = get_view_for_user(