Remove embeds from selected downloads

superdesk · Oct 16, 2023 · 7511520 · 7511520
1 parent 4847ea3
commit 7511520
Show file tree

Hide file tree

Showing 9 changed files with 97 additions and 43 deletions.
diff --git a/features/news_api_item.feature b/features/news_api_item.feature
@@ -204,4 +204,43 @@ Feature: News API Item
       "headline": "headline 1",
       "associations": {"featuremedia": {"renditions": {"original": {}} }}
     }
+    """
+
+  Scenario: Item request response strips embeds
+    Given "items"
+        """
+        [{"_id": "111", "body_html": "<p>Once upon a time there was </p><div class=\"embed-block\">a fish</div><p> who could swim</p><p><!-- EMBED START Image {id: \"editor_19\"} --><figure><img src=\"somthing\" alt=\"alt text\" id=\"editor_19\"<figcaption>Some caption</figcaption></figure><!-- EMBED END Image {id: \"editor_19\"} --></p>",
+         "headline": "headline 1",
+         "firstpublished": "#DATE-1#", "versioncreated": "#DATE#",
+         "associations": {"editor_19": {"products": [{"code": "1234"}], "renditions": {"original": {}} }}}]
+        """
+    Given "products"
+        """
+        [{"name": "A fishy Product",
+        "decsription": "a product for those interested in fish",
+        "companies" : [
+          "#companies._id#"
+        ],
+        "query": "Once upon a time",
+        "product_type": "news_api"
+        },
+        {"name": "A fishy superdesk product",
+        "description": "a superdesk product restricting images in the atom feed",
+        "companies" : [
+          "#companies._id#"
+        ],
+        "sd_product_id": "1234",
+        "product_type": "news_api"
+        }
+        ]
+        """
+    When we get "/news/item/111?format=NINJSFormatter3"
+    Then we get existing resource
+    """
+    {
+      "guid": "111",
+      "headline": "headline 1",
+      "body_html": "<p>Once upon a time there was </p><p> who could swim</p><p></p>",
+      "associations": {}
+    }
     """
diff --git a/newsroom/monitoring/utils.py b/newsroom/monitoring/utils.py
@@ -1,9 +1,7 @@
 from flask import current_app as app
-from lxml import html as lxml_html
-import re
 import collections
 from superdesk.text_utils import get_text
-from newsroom.utils import get_items_by_id
+from newsroom.utils import get_items_by_id, remove_all_embeds
 from superdesk import etree as sd_etree
 
 
@@ -69,28 +67,3 @@ def get_items_for_monitoring_report(_ids, monitoring_profile, full_text=False):
     items = get_items_by_id(_ids, 'items')
     truncate_article_body(items, monitoring_profile, full_text)
     return items
-
-
-def remove_all_embeds(item):
-    """
-    Remove the all embeds from the body of the article
-    :param item:
-    :return:
-    """
-    root_elem = lxml_html.fromstring(item.get('body_html') or '<p></p>')
-    regex = r" EMBED START (?:Image|Video|Audio) {id: \"editor_([0-9]+)"
-    html_updated = False
-    comments = root_elem.xpath('//comment()')
-    for comment in comments:
-        m = re.search(regex, comment.text)
-        # if we've found an Embed Start comment
-        if m and m.group(1):
-            parent = comment.getparent()
-            for elem in comment.itersiblings():
-                parent.remove(elem)
-                if elem.text and ' EMBED END ' in elem.text:
-                    break
-            parent.remove(comment)
-            html_updated = True
-    if html_updated:
-        item["body_html"] = sd_etree.to_string(root_elem, method="html")
diff --git a/newsroom/utils.py b/newsroom/utils.py
@@ -5,6 +5,7 @@
 import pytz
 import re
 from lxml import html as lxml_html
+from lxml.html import clean
 
 from superdesk.etree import to_string
 from superdesk.utc import utcnow
@@ -460,3 +461,36 @@ def update_embeds_in_body(item, update_image=None, update_audio=None, update_vid
                     body_updated = update_video(item, elem, m.group(1)) or body_updated
     if body_updated:
         item['body_html'] = to_string(root_elem, method="html")
+
+
+def remove_all_embeds(item):
+    """
+    Remove all embeds from the body of the article and drop the associations that belong to them.
+
+    The body is run through an lxml ``Cleaner`` that kills every ``figure`` element, after which
+    every ``div`` carrying the ``embed-block`` class is removed. Associations whose key starts
+    with ``editor_`` are popped from the item. The item is modified in place.
+
+    :param item: item dict; ``body_html`` and ``associations`` are read and updated
+    :return: None, an item without a body is left untouched
+    """
+
+    if not item.get("body_html"):
+        return
+
+    # clean all the embedded figures from the html
+    root_elem = lxml_html.fromstring(item["body_html"])
+    cleaner = clean.Cleaner(add_nofollow=False, kill_tags=["figure"])
+    cleaned_xhtml = cleaner.clean_html(root_elem)
+
+    # all embedded tweets etc should be in a div with the class embed-block, these are removed;
+    # the class is matched as a token so a div with additional classes is caught as well
+    embeds = cleaned_xhtml.xpath(
+        "//div[contains(concat(' ', normalize-space(@class), ' '), ' embed-block ')]"
+    )
+    body_is_embed = False
+    for embed in embeds:
+        if embed.getparent() is None:
+            # the embed is the root element, so the whole body is a single embed
+            body_is_embed = True
+            break
+        # drop_tree detaches the element at any depth and keeps the text that follows it,
+        # remove() on the root only works for direct children and discards that tail text
+        embed.drop_tree()
+
+    # remove the associations relating to the embeds
+    associations = item.get("associations") or {}
+    for key in [key for key in associations if key.startswith("editor_")]:
+        associations.pop(key, None)
+
+    if body_is_embed:
+        item["body_html"] = ""
+    else:
+        item["body_html"] = to_string(cleaned_xhtml, encoding="unicode", method='html')
diff --git a/newsroom/wire/formatters/html.py b/newsroom/wire/formatters/html.py
@@ -1,8 +1,6 @@
 import flask
 from .base import BaseFormatter
-from lxml import html as lxml_html
-from lxml.html import clean
-from lxml import etree
+from newsroom.utils import remove_all_embeds
 
 
 class HTMLFormatter(BaseFormatter):
@@ -14,17 +12,7 @@ class HTMLFormatter(BaseFormatter):
     MIMETYPE = 'text/html'
 
     def format_item(self, item, item_type='items'):
-
-        # clean all the embedded figures from the html
-        blacklist = ["figure"]
-        root_elem = lxml_html.fromstring(item.get("body_html", ""))
-        cleaner = clean.Cleaner(
-            add_nofollow=False,
-            kill_tags=blacklist
-        )
-        cleaned_xhtml = cleaner.clean_html(root_elem)
-
-        item["body_html"] = etree.tostring(cleaned_xhtml, encoding="unicode", method='html')
+        remove_all_embeds(item)
 
         if item_type == 'items':
             return str.encode(flask.render_template('download_item.html', item=item), 'utf-8')

diff --git a/newsroom/wire/formatters/newsmlg2.py b/newsroom/wire/formatters/newsmlg2.py
@@ -4,6 +4,7 @@
 
 from superdesk.publish.formatters.nitf_formatter import NITFFormatter
 from superdesk.publish.formatters.newsml_g2_formatter import NewsMLG2Formatter as SuperdeskFormatter
+from newsroom.utils import remove_all_embeds
 
 from .base import BaseFormatter
 
@@ -34,6 +35,7 @@ class NewsMLG2Formatter(BaseFormatter):
     nitf_formatter = NITFFormatter()
 
     def format_item(self, item, item_type='items'):
+        remove_all_embeds(item)
         item = item.copy()
         item.setdefault('guid', item['_id'])
         item.setdefault('_current_version', item['version'])

diff --git a/newsroom/wire/formatters/ninjs2.py b/newsroom/wire/formatters/ninjs2.py
@@ -1,6 +1,7 @@
 from .ninjs import NINJSFormatter
 from newsroom.news_api.utils import check_featuremedia_association_permission
 from newsroom.wire.formatters.utils import remove_internal_renditions
+from newsroom.utils import remove_all_embeds
 
 
 class NINJSFormatter2(NINJSFormatter):
@@ -18,3 +19,15 @@ def _transform_to_ninjs(self, item):
             if not item.get('associations'):
                 item.pop('associations', None)
         return remove_internal_renditions(super()._transform_to_ninjs(item), remove_media=True)
+
+
+class NINJSFormatter3(NINJSFormatter2):
+    """
+    NINJS formatter variant that produces output with no embeds
+    """
+
+    def _transform_to_ninjs(self, item):
+        # the embeds are removed from the item before the parent transformation runs
+        remove_all_embeds(item)
+        return super()._transform_to_ninjs(item)
diff --git a/newsroom/wire/formatters/nitf.py b/newsroom/wire/formatters/nitf.py
@@ -1,6 +1,7 @@
 
 from lxml import etree
 from superdesk.publish.formatters.nitf_formatter import NITFFormatter as SuperdeskNITFFormatter
+from newsroom.utils import remove_all_embeds
 
 from .base import BaseFormatter
 
@@ -14,6 +15,7 @@ class NITFFormatter(BaseFormatter):
     formatter = SuperdeskNITFFormatter()
 
     def format_item(self, item, item_type='items'):
+        remove_all_embeds(item)
         dest = {}
         nitf = self.formatter.get_nitf(item, dest, '')
         return etree.tostring(nitf, xml_declaration=True, pretty_print=True, encoding=self.encoding)
diff --git a/newsroom/wire/formatters/text.py b/newsroom/wire/formatters/text.py
@@ -1,6 +1,7 @@
 
 import flask
 from .base import BaseFormatter
+from newsroom.utils import remove_all_embeds
 
 
 class TextFormatter(BaseFormatter):
@@ -9,6 +10,7 @@ class TextFormatter(BaseFormatter):
     MIMETYPE = 'text/plain'
 
     def format_item(self, item, item_type='items'):
+        remove_all_embeds(item)
         if item_type == 'items':
             return str.encode(flask.render_template('download_item.txt', item=item), 'utf-8')
         else:

diff --git a/newsroom/wire/views.py b/newsroom/wire/views.py
@@ -253,7 +253,8 @@ def download(_ids):
 
     update_action_list(_ids.split(','), 'downloads', force_insert=True)
     get_resource_service('history').create_history_record(items, 'download', user, request.args.get('type', 'wire'))
-    return flask.send_file(_file, mimetype=mimetype, attachment_filename=attachment_filename, as_attachment=True)
+    return flask.send_file(_file, mimetype=mimetype, attachment_filename=attachment_filename, as_attachment=True,
+                           cache_timeout=0)
 
 
 @blueprint.route('/wire_share', methods=['POST'])