From 88d5d7c57e7864bc50738726d5331b56c2c658df Mon Sep 17 00:00:00 2001 From: Janet Riley Date: Sun, 11 Jun 2017 21:21:00 -0400 Subject: [PATCH] Issue #87 Make sanitization happen in Post.htmlize() (#88) * Issue #87 Make sanitization happen in Post.htmlize() * Made utils/text. Moved sanitize functions, constants, and tests there. * Post.htmlize() accepts and applies a sanitization param * Export passes along the level to Post. * Incorporate PR feedback - default sanitize to none * Issue #87 Make sanitization happen in Post.htmlize() Clean up imports and comments in tests. * Issue #87 Make sanitization happen in Post.htmlize() * Add docstrings to the top * move constants to top * rename _get_ methods to get_ - they're not hidden inside a class, may as well make them available directly to those that want it * Fix test; tweak docs --- baleen/export.py | 79 ++++------------------------- baleen/models.py | 14 ++++-- baleen/utils/text.py | 90 +++++++++++++++++++++++++++++++++ tests/test_export.py | 63 +---------------------- tests/utils_tests/test_text.py | 91 ++++++++++++++++++++++++++++++++++ 5 files changed, 201 insertions(+), 136 deletions(-) create mode 100644 baleen/utils/text.py create mode 100644 tests/utils_tests/test_text.py diff --git a/baleen/export.py b/baleen/export.py index 46fc353..0d25948 100644 --- a/baleen/export.py +++ b/baleen/export.py @@ -17,30 +17,22 @@ ## Imports ########################################################################## -import os import codecs -import bleach - -from enum import Enum +import os +from collections import Counter from datetime import datetime -from baleen.models import Feed, Post +from enum import Enum +from operator import itemgetter from tqdm import tqdm -from collections import Counter -from operator import itemgetter -from baleen.exceptions import ExportError -from readability.readability import Document -########################################################################## -## Module Constants 
-########################################################################## +from baleen.exceptions import ExportError +from baleen.models import Feed, Post +from baleen.utils.text import sanitize_html, SAFE, SANITIZE_LEVELS DTFMT = "%b %d, %Y at %H:%M" -RAW = 'raw' -SAFE = 'safe' -TEXT = 'text' -SANITIZE_LEVELS = (RAW, SAFE, TEXT) -SCHEMES = ('json', 'html') + SANITIZE_LEVELS +EXPORT_FORMATS = ('json', 'html') +SCHEMES = EXPORT_FORMATS + SANITIZE_LEVELS State = Enum('State', 'Init, Started, Finished') @@ -222,7 +214,7 @@ def export(self, root=None, categories=None, level=SAFE): with codecs.open(path, 'w', encoding='utf-8') as f: action = { 'json': lambda: post.to_json(indent=2), - 'html': lambda: self.sanitize_html(post.htmlize(), level), + 'html': lambda: post.htmlize(sanitize=level) }[self.scheme] f.write(action()) @@ -232,57 +224,6 @@ def export(self, root=None, categories=None, level=SAFE): self.readme(os.path.join(self.root, "README")) self.feedinfo(os.path.join(self.root, "feeds.json")) - def sanitize_html(self, html, level): - """ - Return a sanitized version of html content - :param html: the content to sanitized - :param level: the type of sanitization - one of ['raw', 'safe', 'text', None] - :return: sanitized content - """ - if level == SAFE: - return self._get_safe_html(html) - elif level == RAW: - return self._get_raw_html(html) - elif level == TEXT: - return self._get_text_from_html(html) - elif level is None: - return html - - raise ExportError( - "{level} is not a supported sanitize_html level.".format( - level=level - ) - ) - - def _get_raw_html(self, html): - """ - :param html: html content - :return: the unmodified html - """ - return html - - def _get_safe_html(self, html): - """ - Applies Readability's sanitize() method to content. 
- :param html: the content to sanitize - :return: the body of the html content minus html tags - """ - return Document(html).summary() - - def _get_text_from_html(self, html): - """ - Applies the 'safe' level of sanitization, removes newlines, - and converts the html entity for ampersand into the ampersand character. - :param html: the content to sanitize - :return: sanitized content - """ - text = self._get_safe_html(html) - text = bleach.clean(text, tags=[], strip=True) - text = text.strip() - text = text.replace("\n", "") - text = text.replace("&amp;", "&") - return text - if __name__ == '__main__': import baleen.models as db diff --git a/baleen/models.py b/baleen/models.py index b64a0ea..8935794 100644 --- a/baleen/models.py +++ b/baleen/models.py @@ -24,6 +24,8 @@ from baleen.config import settings from baleen.utils.cryptography import hash_string from baleen.utils.timez import humanizedelta +from baleen.utils.text import sanitize_html, SAFE, RAW, SANITIZE_LEVELS + ########################################################################## ## Module Constants @@ -134,13 +136,15 @@ def hash(self): """ return hash_string(self.content) - def htmlize(self): + def htmlize(self, sanitize=None): """ - Returns an HTML string of the content of the Post. - In the future we may use bleach to do sanitization or other simple - sanity checks to ensure that things are going ok, which is why this - method stub exists. 
+ Returns the content of the Post with html sanitized + :param sanitize: the level of sanitizing, default to None + :return: the content """ + if sanitize: + return sanitize_html(html=self.content, level=sanitize) + return self.content def __unicode__(self): diff --git a/baleen/utils/text.py b/baleen/utils/text.py new file mode 100644 index 0000000..fb31bae --- /dev/null +++ b/baleen/utils/text.py @@ -0,0 +1,90 @@ +# baleen.utils.text +# Utility functions for Baleen +# +# Author: Benjamin Bengfort +# Created: Sat Jun 03 18:48:00 2017 -0400 +# +# Copyright (C) 2017 Bengfort.com +# For license information, see LICENSE.txt +# +# ID: text.py [caaaaca] benjamin@bengfort.com $ + +""" +Text-related Utility functions for Baleen +""" + +########################################################################## +## Imports +########################################################################## + +import bleach +from readability.readability import Document + +########################################################################## +## Constants +########################################################################## + +RAW = 'raw' +SAFE = 'safe' +TEXT = 'text' +SANITIZE_LEVELS = (RAW, SAFE, TEXT) + + +def get_raw_html(html): + """ + :param html: html content + :return: the unmodified html + """ + return html + + +def get_safe_html(html): + """ + Applies Readability's sanitize() method to content. + :param html: the content to sanitize + :return: the body of the html content minus html tags + """ + if html is None: + return None + return Document(html).summary() + + +def get_text_from_html(html): + """ + Applies the 'safe' level of sanitization, removes newlines, + and converts the html entity for ampersand into the ampersand character. 
+ :param html: the content to sanitize + :return: sanitized content + """ + if html is None: + return html + + text = get_safe_html(html) + text = bleach.clean(text, tags=[], strip=True) + text = text.strip() + text = text.replace("\n", "") + text = text.replace("&amp;", "&") + return text + + +def sanitize_html(html, level): + """ + Return a sanitized version of html content + :param html: the content to sanitize + :param level: the type of sanitization - one of ['raw', 'safe', 'text', None] + :return: sanitized content + """ + if level == SAFE: + return get_safe_html(html) + elif level == RAW: + return get_raw_html(html) + elif level == TEXT: + return get_text_from_html(html) + elif level is None: + return html + + raise ValueError( + "{level} is not a supported sanitize_html level.".format( + level=level + ) + ) diff --git a/tests/test_export.py b/tests/test_export.py index 53a1be4..888112e 100644 --- a/tests/test_export.py +++ b/tests/test_export.py @@ -18,20 +18,13 @@ ########################################################################## import unittest -import logging +from unittest.mock import MagicMock from mongomock import MongoClient as MockMongoClient -from unittest import mock -from unittest.mock import MagicMock from baleen.export import * from baleen.feed import * from baleen.models import connect -from baleen.exceptions import ExportError - -########################################################################## -## Fixtures -########################################################################## BOOKS_FEED = Feed( title='The Rumpus.net', @@ -254,57 +247,3 @@ def test_export_with_category_path_failure(self): exporter.export() -class SanitizeHtmlTests(unittest.TestCase): - """ Tests the exporter's HTML sanitize methods """ - - @classmethod - def setUpClass(cls): - cls.sample_html = ('' - '' - 'body &\n mind' - '') - - cls.conn = connect(host='mongomock://localhost') - assert isinstance(cls.conn, MockMongoClient) - root_path = '/tmp/corpus' - 
cls.exporter = MongoExporter(root_path, categories=CATEGORIES_IN_DB) - - @classmethod - def tearDownClass(self): - """ - Drop the mongomock connection - """ - assert isinstance(self.conn, MockMongoClient) - self.conn = None - - def test_sanitize_requires_a_valid_level(self): - """ Sanitize_html requires a supported level """ - with self.assertRaises(ExportError): - self.exporter.sanitize_html(self.sample_html, "bogus") - - def test_sanitize_returns_input_for_level_none(self): - """ sanitize_html returns unmodified input for level None """ - self.assertEqual(self.exporter.sanitize_html(self.sample_html, None), self.sample_html) - - def test_sanitize_raw(self): - """ Sanitize level raw returns the content as submitted """ - self.assertEqual(self.exporter.sanitize_html(self.sample_html, RAW), self.sample_html) - - def test_sanitize_safe(self): - """ Sanitize level safe applies Readability and returns the body """ - - # Give Readability a simpler HTML sample to keep its parse strategy simple - sample_html = ('' - '' - 'body' - '') - expected = 'body' - self.assertEqual(self.exporter.sanitize_html(sample_html, SAFE), expected) - - def test_sanitize_text(self): - """ - Sanitize level text strips HTML tags, removes newlines, - and converts the html entity ampersand into an ampersand character - """ - expected = 'body & mind' - self.assertEqual(self.exporter.sanitize_html(self.sample_html, TEXT), expected) diff --git a/tests/utils_tests/test_text.py b/tests/utils_tests/test_text.py new file mode 100644 index 0000000..f383db9 --- /dev/null +++ b/tests/utils_tests/test_text.py @@ -0,0 +1,91 @@ +# test.utils_tests.test_text +# Testing for the text helpers library. +# +# Author: Benjamin Bengfort +# Created: Sat Jun 03 18:48:00 2017 -0400 +# +# Copyright (C) 2017 Bengfort.com +# For license information, see LICENSE.txt +# +# ID: test_text.py [df0c71b] benjamin@bengfort.com $ + +""" +Testing for the text helpers library. 
+""" + +########################################################################## +## Imports +########################################################################## + +import unittest + +from baleen.exceptions import ExportError +from baleen.utils.text import sanitize_html, RAW, SAFE, TEXT + + +class SanitizeHtmlTests(unittest.TestCase): + """ Tests the exporter's HTML sanitize methods """ + + @classmethod + def setUpClass(cls): + cls.sample_html = ('' + '' + 'body &\n mind' + '') + + @classmethod + def tearDownClass(self): + """ + Drop the mongomock connection + """ + pass + + def test_sanitize_requires_a_valid_level(self): + """ Sanitize_html requires a supported level """ + with self.assertRaises(ValueError): + sanitize_html(self.sample_html, "bogus") + + def test_sanitize_returns_input_for_level_none(self): + """ sanitize_html returns unmodified input for level None """ + self.assertEqual(sanitize_html(self.sample_html, None), self.sample_html) + + def test_sanitize_raw(self): + """ Sanitize level raw returns the content as submitted """ + self.assertEqual(sanitize_html(self.sample_html, RAW), self.sample_html) + + def test_sanitize_raw_handles_none(self): + """ + Sanitize level raw accepts None gracefully + """ + self.assertEqual(sanitize_html(None, RAW), None) + + def test_sanitize_safe(self): + """ Sanitize level safe applies Readability and returns the body """ + + # Give Readability a simpler HTML sample to keep its parse strategy simple + sample_html = ('' + '' + 'body' + '') + expected = 'body' + self.assertEqual(sanitize_html(sample_html, SAFE), expected) + + def test_sanitize_safe_handles_none(self): + """ + Sanitize level safe accepts None gracefully + """ + self.assertEqual(sanitize_html(None, SAFE), None) + + def test_sanitize_text(self): + """ + Sanitize level text strips HTML tags, removes newlines, + and converts the html entity ampersand into an ampersand character + """ + expected = 'body & mind' + 
self.assertEqual(sanitize_html(self.sample_html, TEXT), expected) + + def test_sanitize_text_handles_none(self): + """ + Sanitize level text accepts None gracefully + """ + self.assertEqual(sanitize_html(None, TEXT), None)