refactor html formatting for nitf/ninjs (#455)

* refactor html formatting for nitf/ninjs so there is similar logic in both; set ninjs copyrightholder to NTB (SDNTB-811)
superdesk · Apr 26, 2023 · b37e09a · b37e09a
1 parent 35d3efe
commit b37e09a
Show file tree

Hide file tree

Showing 4 changed files with 148 additions and 93 deletions.
diff --git a/server/ntb/publish/ntb_ninjs.py b/server/ntb/publish/ntb_ninjs.py
@@ -1,7 +1,12 @@
+import lxml.etree as etree
+
 from flask import g
 from typing import Dict, List
+from lxml.html import HTMLParser
 from superdesk import get_resource_service
+from superdesk.etree import clean_html, to_string
 from superdesk.publish.formatters.ninjs_formatter import NINJSFormatter
+from superdesk.text_utils import get_char_count, get_word_count
 
 from . import utils
 
@@ -58,15 +63,18 @@ def _transform_to_ninjs(self, article, subscriber, recursive=True):
         if ninjs.get("description_text"):
             ninjs["descriptions"] = self.format_descriptions(ninjs)
 
-        if ninjs.get("body_html"):
-            ninjs["bodies"] = self.format_bodies(ninjs)
+        if article.get("body_html"):
+            ninjs["bodies"] = self.format_bodies(article)
 
         if ninjs.get("subject"):
             ninjs["subjects"] = self.format_subjects(ninjs)
 
         if ninjs.get("place"):
             ninjs["places"] = ninjs["place"]
 
+        if ninjs.get("guid"):
+            ninjs.setdefault("uri", ninjs["guid"])
+
         # removed items which mapped according to Ninjs v2 properties
         ninjs_properties = [
             "headlines",
@@ -84,7 +92,6 @@ def _transform_to_ninjs(self, article, subscriber, recursive=True):
             "copyrightnotice",
             "usageterms",
             "ednote",
-            "guid",
             "language",
             "descriptions",
             "bodies",
@@ -99,14 +106,14 @@ def _transform_to_ninjs(self, article, subscriber, recursive=True):
             "by",
             "slugline",
             "located",
-            "renditions",
             "associations",
             "altids",
             "trustindicators",
             "standard",
             "genre",
             "rightsinfo",
             "service",
+            "infosources",
         ]
 
         for key in list(ninjs.keys()):
@@ -131,9 +138,16 @@ def _transform_to_ninjs(self, article, subscriber, recursive=True):
             for tagline in article["sign_off"].split("/"):
                 ninjs["taglines"].append(tagline.strip())
 
+        if article.get("type") == "text":
+            ninjs["infosources"] = [
+                {"name": utils.get_distributor(article)},
+            ]
+
         if recursive:  # should only run at the end, so do this on top level item only
             convert_dicts_to_lists(ninjs)
 
+        ninjs["copyrightholder"] = "NTB"
+
         return ninjs
 
     def _format_place(self, article) -> List[Dict]:
@@ -157,7 +171,6 @@ def _format_place(self, article) -> List[Dict]:
         return places
 
     def _format_rendition(self, rendition):
-        print("IN", rendition)
         formatted = super()._format_rendition(rendition)
         if formatted.get("mimetype"):
             formatted["contenttype"] = formatted.pop("mimetype")
@@ -189,13 +202,21 @@ def format_headlines(self, article):
     def format_descriptions(self, ninjs):
         return [{"value": ninjs.get("description_text"), "contenttype": "text/plain"}]
 
-    def format_bodies(self, ninjs):
+    def format_bodies(self, article):
+        """Build the ninjs ``bodies`` list from the article ``body_html``.
+
+        The html comes from ``utils.format_body_content``, is parsed with a
+        recovering lxml ``HTMLParser``, cleaned with ``clean_html`` and
+        serialized again; char and word counts are computed from that result.
+
+        :param article: item dict to take the body from
+        :return: list with a single ``text/html`` body dict
+        :raises ValueError: when the html can't be parsed
+        """
+        html, _ = utils.format_body_content(article)
+        # NOTE(review): html can be empty here if body_html held nothing but
+        # embedded blocks - confirm how the parser handles that input
+        parser = HTMLParser(recover=True, remove_blank_text=True)
+        try:
+            html_tree = etree.fromstring(html, parser)
+        except Exception as e:
+            # chain explicitly so the parser error is kept as the cause
+            raise ValueError("Can't parse body_html content: {}".format(e)) from e
+        html_tree_clean = clean_html(html_tree)
+        html = to_string(html_tree_clean, method="html", remove_root_div=True)
         return [
             {
-                "charcount": ninjs.get("charcount"),
-                "wordcount": ninjs.get("wordcount"),
-                "value": ninjs.get("body_html"),
-                "contenttype": "text/plain",
+                "charcount": get_char_count(html),
+                "wordcount": get_word_count(html),
+                "value": html,
+                "contenttype": "text/html",
             }
         ]
 

diff --git a/server/ntb/publish/ntb_nitf.py b/server/ntb/publish/ntb_nitf.py
@@ -24,30 +24,19 @@
 from superdesk.publish.formatters.nitf_formatter import NITFFormatter, EraseElement
 from superdesk.publish.publish_service import PublishService
 from superdesk.errors import FormatterError
-from superdesk.cache import cache
 from superdesk.text_utils import get_text
 
 from . import utils
 
 logger = logging.getLogger(__name__)
 tz = None
 
-EMBED_RE = re.compile(
-    r"<!-- EMBED START ([a-zA-Z]+ {id: \"(?P<id>.+?)\"}) -->.*"
-    r"<!-- EMBED END \1 -->",
-    re.DOTALL,
-)
+
 FILENAME_FORBIDDEN_RE = re.compile(r"[^a-zA-Z0-9._-]")
-STRIP_INVALID_CHARS_RE = re.compile("[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]")
 ENCODING = "iso-8859-1"
-LANGUAGE = "nb-NO"  # default language for ntb
 assert ENCODING != "unicode"  # use e.g. utf-8 for unicode
 
 
-def _get_language(article):
-    return article.get("language") or LANGUAGE
-
-
 def get_content_field(article, field):
     content_type = get_resource_service("content_types").find_one(
         req=None, _id=article["profile"]
@@ -80,19 +69,14 @@ def can_format(self, format_type, article):
             format_type == self.FORMAT_TYPE and article[ITEM_TYPE] == CONTENT_TYPE.TEXT
         )
 
-    def strip_invalid_chars(self, string):
-        if string is None:
-            string = ""
-        return STRIP_INVALID_CHARS_RE.sub("", string)
-
     def format(self, original_article, subscriber, codes=None, encoding="us-ascii"):
         article = deepcopy(original_article)
         self._populate_metadata(article)
         global tz
         if tz is None:
             # first time this method is launched
             # we set timezone and NTB specific filter
-            tz = pytz.timezone(superdesk.app.config["DEFAULT_TIMEZONE"])
+            tz = pytz.timezone(app.config["DEFAULT_TIMEZONE"])
         try:
             if article.get("body_html"):
                 article["body_html"] = article["body_html"].replace("<br>", "<br />")
@@ -101,7 +85,7 @@ def format(self, original_article, subscriber, codes=None, encoding="us-ascii"):
             )
             nitf = self.get_nitf(article, subscriber, pub_seq_num)
             try:
-                nitf.attrib["baselang"] = _get_language(article)
+                nitf.attrib["baselang"] = utils.get_language(article)
             except KeyError:
                 pass
 
@@ -448,11 +432,7 @@ def _format_body_head_dateline(self, article, body_head):
     def _format_body_head_distributor(self, article, body_head):
         distrib = etree.SubElement(body_head, "distributor")
         org = etree.SubElement(distrib, "org")
-        language = _get_language(article)
-        if language == "nb-NO":
-            org.text = "NTB"
-        elif language == "nn-NO":
-            org.text = "NPK"
+        org.text = utils.get_distributor(article)
 
     def _add_media(
         self,
@@ -503,48 +483,7 @@ def _format_body_content(self, article, body_content):
             abstract_txt = etree.tostring(abstract, encoding="unicode", method="text")
             p.text = abstract_txt
 
-        # media
-        media_data = []
-        try:
-            associations = article["associations"]
-        except KeyError:
-            pass
-        else:
-            feature_image = associations.get("featureimage")
-            if feature_image is not None:
-                feature_image["_featured"] = "image"
-                media_data.append(feature_image)
-            else:
-                feature_media = associations.get("featuremedia")
-                if feature_media is not None:
-                    feature_media["_featured"] = "media"
-                    media_data.append(feature_media)
-
-        def repl_embedded(match):
-            """Embedded in body_html handling"""
-            # this method do 2 important things:
-            # - it remove the embedded from body_html
-            # - it fill media_data with embedded data in order of appearance
-            id_ = match.group("id")
-            try:
-                data = associations[id_]
-            except KeyError:
-                logger.warning("Expected association {} not found!".format(id_))
-            else:
-                if data is None:
-                    logger.warning(
-                        "media data for association {} is empty, ignoring!".format(id_)
-                    )
-                else:
-                    media_data.append(data)
-            return ""
-
-        html = self.strip_invalid_chars(
-            EMBED_RE.sub(repl_embedded, article.get("body_html") or "")
-        )
-        # it is a request from SDNTB-388 to use normal space instead of non breaking spaces
-        # so we do this replace
-        html = html.replace("&nbsp;", " ")
+        html, media_data = utils.format_body_content(article)
 
         # at this point we have media data filled in right order
         # and no more embedded in html
@@ -625,7 +564,7 @@ def repl_embedded(match):
                     if type_ == "image" or type_ == "grafikk"
                     else "video/mpeg"
                 )
-            caption = self.strip_invalid_chars(data.get("description_text"))
+            caption = utils.strip_invalid_chars(data.get("description_text"))
             self._add_media(body_content, type_, mime_type, source, caption, featured)
         media_counter = len(media_data)
 

diff --git a/server/ntb/publish/utils.py b/server/ntb/publish/utils.py
@@ -1,4 +1,29 @@
 
+import re
+import logging
+import superdesk.etree as sd_etree
+
+from lxml import etree
+from typing import Dict, List, Optional, Tuple
+
+
+LANGUAGE = "nb-NO"  # default language for ntb
+
+EMBED_RE = re.compile(
+    r"<!-- EMBED START ([a-zA-Z]+ {id: \"(?P<id>.+?)\"}) -->.*"
+    r"<!-- EMBED END \1 -->",
+    re.DOTALL,
+)
+
+STRIP_INVALID_CHARS_RE = re.compile("[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]")
+
+logger = logging.getLogger(__name__)
+
+
+def get_language(article) -> str:
+    """Return the article language, or the NTB default if it is unset or empty."""
+    language = article.get("language")
+    if language:
+        return language
+    return LANGUAGE
+
+
 def get_rewrite_sequence(article) -> int:
     return int(article.get("rewrite_sequence") or 0)
 
@@ -12,3 +37,64 @@ def get_doc_id(article) -> str:
         ntb_id=get_ntb_id(article),
         version=get_rewrite_sequence(article),
     )
+
+
+def get_distributor(article) -> str:
+    """Return the distributor name: "NPK" for an nn-NO article, "NTB" otherwise."""
+    return "NPK" if get_language(article) == "nn-NO" else "NTB"
+
+
+def strip_invalid_chars(string: Optional[str]) -> str:
+    """Return ``string`` without the characters matched by STRIP_INVALID_CHARS_RE.
+
+    ``None`` and the empty string both give ``""``.
+    """
+    return STRIP_INVALID_CHARS_RE.sub("", string) if string else ""
+
+
+def format_body_content(article) -> Tuple[str, List[Dict]]:
+    """Remove embedded blocks from the body and collect the article media.
+
+    The returned media list starts with the ``featureimage`` association or,
+    failing that, ``featuremedia``; that dict gets a ``_featured`` key set in
+    place. It is followed by the association of each embedded block found in
+    ``body_html``, in order of appearance.
+
+    The returned html is ``body_html`` with the embedded blocks removed, the
+    characters matched by ``STRIP_INVALID_CHARS_RE`` stripped and ``&nbsp;``
+    replaced with a normal space.
+
+    :param article: item dict, ``associations`` and ``body_html`` are optional
+    :return: ``(html, media_data)`` tuple
+    """
+    # A missing or empty ``associations`` is handled as an empty dict, so an
+    # embed without associations is logged by ``repl_embedded`` below instead
+    # of failing on an unbound name.
+    associations = article.get("associations") or {}
+
+    # media
+    media_data = []
+    feature_image = associations.get("featureimage")
+    if feature_image is not None:
+        feature_image["_featured"] = "image"
+        media_data.append(feature_image)
+    else:
+        feature_media = associations.get("featuremedia")
+        if feature_media is not None:
+            feature_media["_featured"] = "media"
+            media_data.append(feature_media)
+
+    def repl_embedded(match):
+        """Embedded in body_html handling"""
+        # this method does 2 important things:
+        # - it removes the embedded block from body_html
+        # - it fills media_data with embedded data in order of appearance
+        id_ = match.group("id")
+        try:
+            data = associations[id_]
+        except KeyError:
+            logger.warning("Expected association {} not found!".format(id_))
+            return ""
+        if data is None:
+            logger.warning(
+                "media data for association {} is empty, ignoring!".format(id_)
+            )
+        else:
+            media_data.append(data)
+        return ""
+
+    html = strip_invalid_chars(
+        EMBED_RE.sub(repl_embedded, article.get("body_html") or "")
+    )
+
+    # it is a request from SDNTB-388 to use normal space instead of non breaking spaces
+    # so we do this replace
+    html = html.replace("&nbsp;", " ")
+
+    return html, media_data