-
Notifications
You must be signed in to change notification settings - Fork 38
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
* Issue #87 Make sanitization happen in Post.htmlize() * Made utils/text. Moved sanitize functions, constants, and tests there. * Post.htmlize() accepts and applies a sanitization param * Export passes along the level to Post. * Incorporate PR feedback - default sanitize to none * Issue #87 Make sanitization happen in Post.htmlize() Clean up imports and comments in tests. * Issue #87 Make sanitization happen in Post.htmlize() * Add docstrings to the top * move constants to top * rename _get_ methods to get_ - they're not hidden inside a class, may as well make them available directly to those that want it * Fix test; tweak docs
- Loading branch information
1 parent
b89caa2
commit 88d5d7c
Showing
5 changed files
with
201 additions
and
136 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
# baleen.utils.text | ||
# Utility functions for Baleen | ||
# | ||
# Author: Benjamin Bengfort <[email protected]> | ||
# Created: Sat Jun 03 18:48:00 2017 -0400 | ||
# | ||
# Copyright (C) 2017 Bengfort.com | ||
# For license information, see LICENSE.txt | ||
# | ||
# ID: text.py [caaaaca] [email protected] $ | ||
|
||
""" | ||
Text-related Utility functions for Baleen | ||
""" | ||
|
||
########################################################################## | ||
## Imports | ||
########################################################################## | ||
|
||
import bleach | ||
from readability.readability import Document | ||
|
||
########################################################################## | ||
## Constants | ||
########################################################################## | ||
|
||
# Names of the supported sanitization levels accepted by sanitize_html().
RAW = 'raw'    # leave the HTML exactly as received
SAFE = 'safe'  # run the HTML through Readability
TEXT = 'text'  # 'safe' output with all markup stripped

# Every valid level, in increasing order of how much is removed.
SANITIZE_LEVELS = (RAW, SAFE, TEXT)
|
||
|
||
def get_raw_html(html):
    """
    Identity "sanitizer" used for the 'raw' level: performs no processing.

    :param html: html content (may be None)
    :return: the unmodified html, exactly as passed in
    """
    return html
|
||
|
||
def get_safe_html(html):
    """
    Runs the content through Readability and returns its summary.

    The result is still HTML (markup is kept); use get_text_from_html()
    when tags must be removed as well.

    :param html: the content to sanitize (may be None)
    :return: the value of ``Document(html).summary()``, or None when
        html is None
    """
    # Readability cannot parse None, so pass it straight through.
    if html is None:
        return None
    return Document(html).summary()
|
||
|
||
def get_text_from_html(html):
    """
    Applies the 'safe' level of sanitization, strips all HTML tags,
    removes newlines, and converts the html entity for ampersand
    (``&amp;``) into the ampersand character.

    :param html: the content to sanitize (may be None)
    :return: sanitized plain-text content, or None when html is None
    """
    if html is None:
        return html

    text = get_safe_html(html)
    # An empty tag whitelist plus strip=True removes every tag from the
    # output instead of escaping it.
    text = bleach.clean(text, tags=[], strip=True)
    text = text.strip()
    text = text.replace("\n", "")
    # Decode the ampersand entity back to a literal '&'.
    text = text.replace("&amp;", "&")
    return text
|
||
|
||
def sanitize_html(html, level):
    """
    Return a sanitized version of html content

    :param html: the content to sanitize
    :param level: the type of sanitization - one of ['raw', 'safe', 'text', None]
    :return: sanitized content; the input is returned untouched when
        level is None or 'raw'
    :raises ValueError: if level is not one of the supported values
    """
    # No level requested: nothing to do.
    if level is None:
        return html

    if level == SAFE:
        return get_safe_html(html)
    elif level == RAW:
        return get_raw_html(html)
    elif level == TEXT:
        return get_text_from_html(html)

    raise ValueError(
        "{level} is not a supported sanitize_html level.".format(
            level=level
        )
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
# test.utils_tests.test_text | ||
# Testing for the text helpers library. | ||
# | ||
# Author: Benjamin Bengfort <[email protected]> | ||
# Created: Sat Jun 03 18:48:00 2017 -0400 | ||
# | ||
# Copyright (C) 2017 Bengfort.com | ||
# For license information, see LICENSE.txt | ||
# | ||
# ID: test_text.py [df0c71b] [email protected] $ | ||
|
||
""" | ||
Testing for the text helpers library. | ||
""" | ||
|
||
########################################################################## | ||
## Imports | ||
########################################################################## | ||
|
||
import unittest | ||
|
||
from baleen.exceptions import ExportError | ||
from baleen.utils.text import sanitize_html, RAW, SAFE, TEXT | ||
|
||
|
||
class SanitizeHtmlTests(unittest.TestCase):
    """ Tests the HTML sanitize helpers in baleen.utils.text """

    @classmethod
    def setUpClass(cls):
        # Shared fixture: a script in the head plus a body containing an
        # ampersand entity and a newline, the pieces the 'text' level
        # is documented to clean up.
        cls.sample_html = ('<html>'
                           '<head><script>javascript here</script></head>'
                           '<body><b>body &amp;\n mind</b></body>'
                           '</html>')

    @classmethod
    def tearDownClass(cls):
        """
        Nothing to release; kept as an explicit no-op hook.
        """
        pass

    def test_sanitize_requires_a_valid_level(self):
        """ Sanitize_html requires a supported level """
        with self.assertRaises(ValueError):
            sanitize_html(self.sample_html, "bogus")

    def test_sanitize_returns_input_for_level_none(self):
        """ sanitize_html returns unmodified input for level None """
        self.assertEqual(sanitize_html(self.sample_html, None), self.sample_html)

    def test_sanitize_raw(self):
        """ Sanitize level raw returns the content as submitted """
        self.assertEqual(sanitize_html(self.sample_html, RAW), self.sample_html)

    def test_sanitize_raw_handles_none(self):
        """
        Sanitize level raw accepts None gracefully
        """
        self.assertIsNone(sanitize_html(None, RAW))

    def test_sanitize_safe(self):
        """ Sanitize level safe applies Readability and returns the body """

        # Give Readability a simpler HTML sample to keep its parse strategy simple
        sample_html = ('<html>'
                       '<head><script>javascript here</script></head>'
                       '<body>body</body>'
                       '</html>')
        expected = '<body id="readabilityBody">body</body>'
        self.assertEqual(sanitize_html(sample_html, SAFE), expected)

    def test_sanitize_safe_handles_none(self):
        """
        Sanitize level safe accepts None gracefully
        """
        self.assertIsNone(sanitize_html(None, SAFE))

    def test_sanitize_text(self):
        """
        Sanitize level text strips HTML tags, removes newlines,
        and converts the html entity ampersand into an ampersand character
        """
        expected = 'body & mind'
        self.assertEqual(sanitize_html(self.sample_html, TEXT), expected)

    def test_sanitize_text_handles_none(self):
        """
        Sanitize level text accepts None gracefully
        """
        self.assertIsNone(sanitize_html(None, TEXT))