Issue #87 Make sanitization happen in Post.htmlize() (#88)

* Issue #87 Make sanitization happen in Post.htmlize() * Made utils/text. Moved sanitize functions, constants, and tests there. * Post.htmlize() accepts and applies a sanitization param * Export passes along the level to Post. * Incorporate PR feedback - default sanitize to none * Issue #87 Make sanitization happen in Post.htmlize() Clean up imports and comments in tests. * Issue #87 Make sanitization happen in Post.htmlize() * Add docstrings to the top * move constants to top * rename _get_ methods to get_ - they're not hidden inside a class, may as well make them available directly to those that want it * Fix test; tweak docs
DistrictDataLabs · Jun 12, 2017 · 88d5d7c · 88d5d7c
1 parent b89caa2
commit 88d5d7c
Show file tree

Hide file tree

Showing 5 changed files with 201 additions and 136 deletions.
diff --git a/baleen/export.py b/baleen/export.py
@@ -17,30 +17,22 @@
 ## Imports
 ##########################################################################
 
-import os
 import codecs
-import bleach
-
-from enum import Enum
+import os
+from collections import Counter
 from datetime import datetime
-from baleen.models import Feed, Post
+from enum import Enum
+from operator import itemgetter
 
 from tqdm import tqdm
-from collections import Counter
-from operator import itemgetter
-from baleen.exceptions import ExportError
-from readability.readability import Document
 
-##########################################################################
-## Module Constants
-##########################################################################
+from baleen.exceptions import ExportError
+from baleen.models import Feed, Post
+from baleen.utils.text import sanitize_html, SAFE, SANITIZE_LEVELS
 
 DTFMT = "%b %d, %Y at %H:%M"
-RAW = 'raw'
-SAFE = 'safe'
-TEXT = 'text'
-SANITIZE_LEVELS = (RAW, SAFE, TEXT)
-SCHEMES = ('json', 'html') + SANITIZE_LEVELS
+EXPORT_FORMATS = ('json', 'html')
+SCHEMES = EXPORT_FORMATS + SANITIZE_LEVELS
 State = Enum('State', 'Init, Started, Finished')
 
 
@@ -222,7 +214,7 @@ def export(self, root=None, categories=None, level=SAFE):
             with codecs.open(path, 'w', encoding='utf-8') as f:
                 action = {
                     'json': lambda: post.to_json(indent=2),
-                    'html': lambda: self.sanitize_html(post.htmlize(), level),
+                    'html': lambda: post.htmlize(sanitize=level)
                 }[self.scheme]
 
                 f.write(action())
@@ -232,57 +224,6 @@ def export(self, root=None, categories=None, level=SAFE):
         self.readme(os.path.join(self.root, "README"))
         self.feedinfo(os.path.join(self.root, "feeds.json"))
 
-    def sanitize_html(self, html, level):
-        """
-        Return a sanitized version of html content
-        :param html: the content to sanitized
-        :param level: the type of sanitization - one of ['raw', 'safe', 'text', None]
-        :return: sanitized content
-        """
-        if level == SAFE:
-            return self._get_safe_html(html)
-        elif level == RAW:
-            return self._get_raw_html(html)
-        elif level == TEXT:
-            return self._get_text_from_html(html)
-        elif level is None:
-            return html
-
-        raise ExportError(
-            "{level} is not a supported sanitize_html level.".format(
-                level=level
-            )
-        )
-
-    def _get_raw_html(self, html):
-        """
-        :param html: html content
-        :return: the unmodified html
-        """
-        return html
-
-    def _get_safe_html(self, html):
-        """
-        Applies Readability's sanitize() method to content.
-        :param html: the content to sanitize
-        :return: the body of the html content minus html tags
-        """
-        return Document(html).summary()
-
-    def _get_text_from_html(self, html):
-        """
-        Applies the 'safe' level of sanitization, removes newlines,
-        and converts the html entity for ampersand into the ampersand character.
-        :param html: the content to sanitize
-        :return: sanitized content
-        """
-        text = self._get_safe_html(html)
-        text = bleach.clean(text, tags=[], strip=True)
-        text = text.strip()
-        text = text.replace("\n", "")
-        text = text.replace("&amp;", "&")
-        return text
-
 
 if __name__ == '__main__':
     import baleen.models as db

diff --git a/baleen/models.py b/baleen/models.py
@@ -24,6 +24,8 @@
 from baleen.config import settings
 from baleen.utils.cryptography import hash_string
 from baleen.utils.timez import humanizedelta
+from baleen.utils.text import sanitize_html, SAFE, RAW, SANITIZE_LEVELS
+
 
 ##########################################################################
 ## Module Constants
@@ -134,13 +136,15 @@ def hash(self):
         """
         return hash_string(self.content)
 
-    def htmlize(self):
+    def htmlize(self, sanitize=None):
         """
-        Returns an HTML string of the content of the Post.
-        In the future we may use bleach to do sanitization or other simple
-        sanity checks to ensure that things are going ok, which is why this
-        method stub exists.
+        Returns the content of the Post with html sanitized
+        :param sanitize: the level of sanitizing, default to None
+        :return: the content
         """
+        if sanitize:
+            return sanitize_html(html=self.content, level=sanitize)
+
         return self.content
 
     def __unicode__(self):

diff --git a/baleen/utils/text.py b/baleen/utils/text.py
@@ -0,0 +1,90 @@
+# baleen.utils.text
+# Utility functions for Baleen
+#
+# Author:   Benjamin Bengfort <[email protected]>
+# Created:  Sat Jun 03 18:48:00 2017 -0400
+#
+# Copyright (C) 2017 Bengfort.com
+# For license information, see LICENSE.txt
+#
+# ID: text.py [caaaaca] [email protected] $
+
+"""
+Text-related utility functions for Baleen
+"""
+
+##########################################################################
+## Imports
+##########################################################################
+
+import bleach
+from readability.readability import Document
+
+##########################################################################
+## Constants
+##########################################################################
+
+RAW = 'raw'
+SAFE = 'safe'
+TEXT = 'text'
+SANITIZE_LEVELS = (RAW, SAFE, TEXT)
+
+
+def get_raw_html(html):
+    """
+    :param html: html content
+    :return: the unmodified html
+    """
+    return html
+
+
+def get_safe_html(html):
+    """
+    Applies Readability's Document.summary() to the content.
+    :param html: the content to sanitize (None is passed through as None)
+    :return: the readable body of the content, still marked up as html
+    """
+    if html is None:
+        return None
+    return Document(html).summary()
+
+
+def get_text_from_html(html):
+    """
+    Applies the 'safe' level of sanitization, removes newlines,
+    and converts the html entity for ampersand into the ampersand character.
+    :param html: the content to sanitize
+    :return: sanitized content
+    """
+    if html is None:
+        return html
+
+    text = get_safe_html(html)
+    text = bleach.clean(text, tags=[], strip=True)
+    text = text.strip()
+    text = text.replace("\n", "")
+    text = text.replace("&amp;", "&")
+    return text
+
+
+def sanitize_html(html, level):
+    """
+    Return a sanitized version of html content
+    :param html: the content to sanitized
+    :param level: the type of sanitization - one of ['raw', 'safe', 'text', None]
+    :return: sanitized content
+    """
+    if level == SAFE:
+        return get_safe_html(html)
+    elif level == RAW:
+        return get_raw_html(html)
+    elif level == TEXT:
+        return get_text_from_html(html)
+    elif level is None:
+        return html
+
+    raise ValueError(
+        "{level} is not a supported sanitize_html level.".format(
+            level=level
+        )
+    )
diff --git a/tests/test_export.py b/tests/test_export.py
@@ -18,20 +18,13 @@
 ##########################################################################
 
 import unittest
-import logging
+from unittest.mock import MagicMock
 
 from mongomock import MongoClient as MockMongoClient
-from unittest import mock
-from unittest.mock import MagicMock
 
 from baleen.export import *
 from baleen.feed import *
 from baleen.models import connect
-from baleen.exceptions import ExportError
-
-##########################################################################
-## Fixtures
-##########################################################################
 
 BOOKS_FEED = Feed(
     title='The Rumpus.net',
@@ -254,57 +247,3 @@ def test_export_with_category_path_failure(self):
                 exporter.export()
 
 
-class SanitizeHtmlTests(unittest.TestCase):
-    """ Tests the exporter's HTML sanitize methods """
-
-    @classmethod
-    def setUpClass(cls):
-        cls.sample_html = ('<html>'
-                           '<head><script>javascript here</script></head>'
-                           '<body><b>body &amp;\n mind</b></body>'
-                           '</html>')
-
-        cls.conn = connect(host='mongomock://localhost')
-        assert isinstance(cls.conn, MockMongoClient)
-        root_path = '/tmp/corpus'
-        cls.exporter = MongoExporter(root_path, categories=CATEGORIES_IN_DB)
-
-    @classmethod
-    def tearDownClass(self):
-        """
-        Drop the mongomock connection
-        """
-        assert isinstance(self.conn, MockMongoClient)
-        self.conn = None
-
-    def test_sanitize_requires_a_valid_level(self):
-        """  Sanitize_html requires a supported level """
-        with self.assertRaises(ExportError):
-            self.exporter.sanitize_html(self.sample_html, "bogus")
-
-    def test_sanitize_returns_input_for_level_none(self):
-        """  sanitize_html returns unmodified input for level None """
-        self.assertEqual(self.exporter.sanitize_html(self.sample_html, None), self.sample_html)
-
-    def test_sanitize_raw(self):
-        """  Sanitize level raw returns the content as submitted """
-        self.assertEqual(self.exporter.sanitize_html(self.sample_html, RAW), self.sample_html)
-
-    def test_sanitize_safe(self):
-        """  Sanitize level safe applies Readability and returns the body """
-
-        # Give Readability a simpler HTML sample to keep its parse strategy simple
-        sample_html = ('<html>'
-                       '<head><script>javascript here</script></head>'
-                       '<body>body</body>'
-                       '</html>')
-        expected = '<body id="readabilityBody">body</body>'
-        self.assertEqual(self.exporter.sanitize_html(sample_html, SAFE), expected)
-
-    def test_sanitize_text(self):
-        """
-        Sanitize level text strips HTML tags, removes newlines,
-         and converts the html entity ampersand into an ampersand character
-        """
-        expected = 'body & mind'
-        self.assertEqual(self.exporter.sanitize_html(self.sample_html, TEXT), expected)
diff --git a/tests/utils_tests/test_text.py b/tests/utils_tests/test_text.py
@@ -0,0 +1,91 @@
+# test.utils_tests.test_text
+# Testing for the text helpers library.
+#
+# Author:   Benjamin Bengfort <[email protected]>
+# Created:  Sat Jun 03 18:48:00 2017 -0400
+#
+# Copyright (C) 2017 Bengfort.com
+# For license information, see LICENSE.txt
+#
+# ID: test_text.py [df0c71b] [email protected] $
+
+"""
+Testing for the text helpers library.
+"""
+
+##########################################################################
+## Imports
+##########################################################################
+
+import unittest
+
+from baleen.exceptions import ExportError
+from baleen.utils.text import sanitize_html, RAW, SAFE, TEXT
+
+
+class SanitizeHtmlTests(unittest.TestCase):
+    """ Tests the HTML sanitize functions in baleen.utils.text """
+
+    @classmethod
+    def setUpClass(cls):
+        cls.sample_html = ('<html>'
+                           '<head><script>javascript here</script></head>'
+                           '<body><b>body &amp;\n mind</b></body>'
+                           '</html>')
+
+    @classmethod
+    def tearDownClass(self):
+        """
+        Nothing to tear down; these tests open no mongomock connection
+        """
+        pass
+
+    def test_sanitize_requires_a_valid_level(self):
+        """  Sanitize_html requires a supported level """
+        with self.assertRaises(ValueError):
+            sanitize_html(self.sample_html, "bogus")
+
+    def test_sanitize_returns_input_for_level_none(self):
+        """  sanitize_html returns unmodified input for level None """
+        self.assertEqual(sanitize_html(self.sample_html, None), self.sample_html)
+
+    def test_sanitize_raw(self):
+        """  Sanitize level raw returns the content as submitted """
+        self.assertEqual(sanitize_html(self.sample_html, RAW), self.sample_html)
+
+    def test_sanitize_raw_handles_none(self):
+        """
+        Sanitize level raw accepts None gracefully
+        """
+        self.assertEqual(sanitize_html(None, RAW), None)
+
+    def test_sanitize_safe(self):
+        """  Sanitize level safe applies Readability and returns the body """
+
+        # Give Readability a simpler HTML sample to keep its parse strategy simple
+        sample_html = ('<html>'
+                       '<head><script>javascript here</script></head>'
+                       '<body>body</body>'
+                       '</html>')
+        expected = '<body id="readabilityBody">body</body>'
+        self.assertEqual(sanitize_html(sample_html, SAFE), expected)
+
+    def test_sanitize_safe_handles_none(self):
+        """
+        Sanitize level safe accepts None gracefully
+        """
+        self.assertEqual(sanitize_html(None, SAFE), None)
+
+    def test_sanitize_text(self):
+        """
+        Sanitize level text strips HTML tags, removes newlines,
+         and converts the html entity ampersand into an ampersand character
+        """
+        expected = 'body & mind'
+        self.assertEqual(sanitize_html(self.sample_html, TEXT), expected)
+
+    def test_sanitize_text_handles_none(self):
+        """
+        Sanitize level text accepts None gracefully
+        """
+        self.assertEqual(sanitize_html(None, TEXT), None)