From 9b0e7efbb6a18f69aa1f6b722a79a88a93cb8cd5 Mon Sep 17 00:00:00 2001 From: marwoodandrew Date: Mon, 16 Oct 2023 11:01:43 +1100 Subject: [PATCH 1/2] Remove embeds from selected downloads --- features/news_api_item.feature | 39 ++++++++++++++++++++++++++++ newsroom/monitoring/utils.py | 29 +-------------------- newsroom/utils.py | 34 ++++++++++++++++++++++++ newsroom/wire/formatters/html.py | 16 ++---------- newsroom/wire/formatters/newsmlg2.py | 2 ++ newsroom/wire/formatters/ninjs2.py | 12 +++++++++ newsroom/wire/formatters/nitf.py | 2 ++ newsroom/wire/formatters/text.py | 2 ++ newsroom/wire/views.py | 3 ++- 9 files changed, 96 insertions(+), 43 deletions(-) diff --git a/features/news_api_item.feature b/features/news_api_item.feature index 893de640d..78709c751 100644 --- a/features/news_api_item.feature +++ b/features/news_api_item.feature @@ -204,4 +204,43 @@ Feature: News API Item "headline": "headline 1", "associations": {"featuremedia": {"renditions": {"original": {}} }} } + """ + + Scenario: Item request response strips embeds + Given "items" + """ + [{"_id": "111", "body_html": "

Once upon a time there was

a fish

who could swim

\"altSome caption

", + "headline": "headline 1", + "firstpublished": "#DATE-1#", "versioncreated": "#DATE#", + "associations": {"editor_19": {"products": [{"code": "1234"}], "renditions": {"original": {}} }}}] + """ + Given "products" + """ + [{"name": "A fishy Product", + "decsription": "a product for those interested in fish", + "companies" : [ + "#companies._id#" + ], + "query": "Once upon a time", + "product_type": "news_api" + }, + {"name": "A fishy superdesk product", + "description": "a superdesk product restricting images in the atom feed", + "companies" : [ + "#companies._id#" + ], + "sd_product_id": "1234", + "product_type": "news_api" + } + ] + """ + When we get "/news/item/111?format=NINJSFormatter3" + Then we get existing resource + """ + { + "guid": "111", + "headline": "headline 1", + "body_html": "

Once upon a time there was

who could swim

", + "associations": {} + } """ \ No newline at end of file diff --git a/newsroom/monitoring/utils.py b/newsroom/monitoring/utils.py index 9d7799b99..b5f5c6115 100644 --- a/newsroom/monitoring/utils.py +++ b/newsroom/monitoring/utils.py @@ -1,9 +1,7 @@ from flask import current_app as app -from lxml import html as lxml_html -import re import collections from superdesk.text_utils import get_text -from newsroom.utils import get_items_by_id +from newsroom.utils import get_items_by_id, remove_all_embeds from superdesk import etree as sd_etree @@ -69,28 +67,3 @@ def get_items_for_monitoring_report(_ids, monitoring_profile, full_text=False): items = get_items_by_id(_ids, 'items') truncate_article_body(items, monitoring_profile, full_text) return items - - -def remove_all_embeds(item): - """ - Remove the all embeds from the body of the article - :param item: - :return: - """ - root_elem = lxml_html.fromstring(item.get('body_html') or '

') - regex = r" EMBED START (?:Image|Video|Audio) {id: \"editor_([0-9]+)" - html_updated = False - comments = root_elem.xpath('//comment()') - for comment in comments: - m = re.search(regex, comment.text) - # if we've found an Embed Start comment - if m and m.group(1): - parent = comment.getparent() - for elem in comment.itersiblings(): - parent.remove(elem) - if elem.text and ' EMBED END ' in elem.text: - break - parent.remove(comment) - html_updated = True - if html_updated: - item["body_html"] = sd_etree.to_string(root_elem, method="html") diff --git a/newsroom/utils.py b/newsroom/utils.py index 7a06beb5f..a842b25ec 100644 --- a/newsroom/utils.py +++ b/newsroom/utils.py @@ -5,6 +5,7 @@ import pytz import re from lxml import html as lxml_html +from lxml.html import clean from superdesk.etree import to_string from superdesk.utc import utcnow @@ -460,3 +461,36 @@ def update_embeds_in_body(item, update_image=None, update_audio=None, update_vid body_updated = update_video(item, elem, m.group(1)) or body_updated if body_updated: item['body_html'] = to_string(root_elem, method="html") + + +def remove_all_embeds(item): + """ + Remove the all embeds from the body of the article, including any divs with the embed_block attribute + :param item: + :return: + """ + + if not item.get("body_html", ""): + return + + # clean all the embedded figures from the html + blacklist = ["figure"] + root_elem = lxml_html.fromstring(item.get("body_html", "")) + + cleaner = clean.Cleaner( + add_nofollow=False, + kill_tags=blacklist + ) + cleaned_xhtml = cleaner.clean_html(root_elem) + + # all embedded tweets etc should be in a div with the class embeded-block, these are removed + embeds = cleaned_xhtml.xpath('//div[@class=\'embed-block\']') + for embed in embeds: + cleaned_xhtml.remove(embed) + + # remove the associations relating to the embeds + kill_keys = [key for key in item.get("associations", {}) if key.startswith("editor_")] + for key in kill_keys: + item.get("associations", 
{}).pop(key, None) + + item["body_html"] = to_string(cleaned_xhtml, encoding="unicode", method='html') diff --git a/newsroom/wire/formatters/html.py b/newsroom/wire/formatters/html.py index 78b3b7603..07e0c9304 100644 --- a/newsroom/wire/formatters/html.py +++ b/newsroom/wire/formatters/html.py @@ -1,8 +1,6 @@ import flask from .base import BaseFormatter -from lxml import html as lxml_html -from lxml.html import clean -from lxml import etree +from newsroom.utils import remove_all_embeds class HTMLFormatter(BaseFormatter): @@ -14,17 +12,7 @@ class HTMLFormatter(BaseFormatter): MIMETYPE = 'text/html' def format_item(self, item, item_type='items'): - - # clean all the embedded figures from the html - blacklist = ["figure"] - root_elem = lxml_html.fromstring(item.get("body_html", "")) - cleaner = clean.Cleaner( - add_nofollow=False, - kill_tags=blacklist - ) - cleaned_xhtml = cleaner.clean_html(root_elem) - - item["body_html"] = etree.tostring(cleaned_xhtml, encoding="unicode", method='html') + remove_all_embeds(item) if item_type == 'items': return str.encode(flask.render_template('download_item.html', item=item), 'utf-8') diff --git a/newsroom/wire/formatters/newsmlg2.py b/newsroom/wire/formatters/newsmlg2.py index 5a54102a8..80159b6ab 100644 --- a/newsroom/wire/formatters/newsmlg2.py +++ b/newsroom/wire/formatters/newsmlg2.py @@ -4,6 +4,7 @@ from superdesk.publish.formatters.nitf_formatter import NITFFormatter from superdesk.publish.formatters.newsml_g2_formatter import NewsMLG2Formatter as SuperdeskFormatter +from newsroom.utils import remove_all_embeds from .base import BaseFormatter @@ -34,6 +35,7 @@ class NewsMLG2Formatter(BaseFormatter): nitf_formatter = NITFFormatter() def format_item(self, item, item_type='items'): + remove_all_embeds(item) item = item.copy() item.setdefault('guid', item['_id']) item.setdefault('_current_version', item['version']) diff --git a/newsroom/wire/formatters/ninjs2.py b/newsroom/wire/formatters/ninjs2.py index e76018024..cdb1b0f31 
100644 --- a/newsroom/wire/formatters/ninjs2.py +++ b/newsroom/wire/formatters/ninjs2.py @@ -1,6 +1,7 @@ from .ninjs import NINJSFormatter from newsroom.news_api.utils import check_featuremedia_association_permission from newsroom.wire.formatters.utils import remove_internal_renditions +from newsroom.utils import remove_all_embeds class NINJSFormatter2(NINJSFormatter): @@ -18,3 +19,14 @@ def _transform_to_ninjs(self, item): if not item.get('associations'): item.pop('associations', None) return remove_internal_renditions(super()._transform_to_ninjs(item), remove_media=True) + + +class NINJSFormatter3(NINJSFormatter2): + """ + Format with no Embeds + """ + + def _transform_to_ninjs(self, item): + remove_all_embeds(item) + ninjs = super()._transform_to_ninjs(item) + return ninjs diff --git a/newsroom/wire/formatters/nitf.py b/newsroom/wire/formatters/nitf.py index af44987ed..7dd87761a 100644 --- a/newsroom/wire/formatters/nitf.py +++ b/newsroom/wire/formatters/nitf.py @@ -1,6 +1,7 @@ from lxml import etree from superdesk.publish.formatters.nitf_formatter import NITFFormatter as SuperdeskNITFFormatter +from newsroom.utils import remove_all_embeds from .base import BaseFormatter @@ -14,6 +15,7 @@ class NITFFormatter(BaseFormatter): formatter = SuperdeskNITFFormatter() def format_item(self, item, item_type='items'): + remove_all_embeds(item) dest = {} nitf = self.formatter.get_nitf(item, dest, '') return etree.tostring(nitf, xml_declaration=True, pretty_print=True, encoding=self.encoding) diff --git a/newsroom/wire/formatters/text.py b/newsroom/wire/formatters/text.py index eea015991..d7a36d8f2 100644 --- a/newsroom/wire/formatters/text.py +++ b/newsroom/wire/formatters/text.py @@ -1,6 +1,7 @@ import flask from .base import BaseFormatter +from newsroom.utils import remove_all_embeds class TextFormatter(BaseFormatter): @@ -9,6 +10,7 @@ class TextFormatter(BaseFormatter): MIMETYPE = 'text/plain' def format_item(self, item, item_type='items'): + remove_all_embeds(item) if 
item_type == 'items': return str.encode(flask.render_template('download_item.txt', item=item), 'utf-8') else: diff --git a/newsroom/wire/views.py b/newsroom/wire/views.py index a6031575a..32801e3d2 100644 --- a/newsroom/wire/views.py +++ b/newsroom/wire/views.py @@ -253,7 +253,8 @@ def download(_ids): update_action_list(_ids.split(','), 'downloads', force_insert=True) get_resource_service('history').create_history_record(items, 'download', user, request.args.get('type', 'wire')) - return flask.send_file(_file, mimetype=mimetype, attachment_filename=attachment_filename, as_attachment=True) + return flask.send_file(_file, mimetype=mimetype, attachment_filename=attachment_filename, as_attachment=True, + cache_timeout=0) @blueprint.route('/wire_share', methods=['POST']) From cc0cd8a113131e8954d264f432e961c05200aeda Mon Sep 17 00:00:00 2001 From: marwoodandrew Date: Tue, 14 Nov 2023 10:37:06 +1100 Subject: [PATCH 2/2] Additional options on NINJS item endpoints --- features/news_api_item.feature | 29 +++++++++++++++++++++++++++++ newsroom/utils.py | 29 ++++++++++++++++------------- newsroom/wire/formatters/ninjs.py | 16 ++++++++++++++++ 3 files changed, 61 insertions(+), 13 deletions(-) diff --git a/features/news_api_item.feature b/features/news_api_item.feature index 78709c751..32b0adbb1 100644 --- a/features/news_api_item.feature +++ b/features/news_api_item.feature @@ -234,6 +234,35 @@ Feature: News API Item } ] """ + When we get "/news/item/111?format=NINJSFormatter&no_embeds=true&no_media=1" + Then we get existing resource + """ + { + "guid": "111", + "headline": "headline 1", + "body_html": "

Once upon a time there was

who could swim

" + } + """ + When we get "/news/item/111?format=NINJSFormatter2&no_embeds=true" + Then we get existing resource + """ + { + "guid": "111", + "headline": "headline 1", + "body_html": "

Once upon a time there was

who could swim

\"altSome caption

", + "associations": {"editor_19": {"renditions": {"original": {}}}} + } + """ + When we get "/news/item/111?format=NINJSFormatter2&no_media=true" + Then we get existing resource + """ + { + "guid": "111", + "headline": "headline 1", + "body_html": "

Once upon a time there was

a fish

who could swim

", + "associations": {} + } + """ When we get "/news/item/111?format=NINJSFormatter3" Then we get existing resource """ diff --git a/newsroom/utils.py b/newsroom/utils.py index a842b25ec..a8e430a93 100644 --- a/newsroom/utils.py +++ b/newsroom/utils.py @@ -463,30 +463,33 @@ def update_embeds_in_body(item, update_image=None, update_audio=None, update_vid item['body_html'] = to_string(root_elem, method="html") -def remove_all_embeds(item): +def remove_all_embeds(item, remove_by_class=True, remove_media_embeds=True): """ Remove the all embeds from the body of the article, including any divs with the embed_block attribute :param item: + :param remove_by_class: If true removes any divs that have the embed-block class, should remove such things as + embedded tweets + :param remove_media_embeds: Remove any figure tags if the passed value is true :return: """ - if not item.get("body_html", ""): return - # clean all the embedded figures from the html - blacklist = ["figure"] root_elem = lxml_html.fromstring(item.get("body_html", "")) - cleaner = clean.Cleaner( - add_nofollow=False, - kill_tags=blacklist - ) - cleaned_xhtml = cleaner.clean_html(root_elem) + if remove_by_class: + # all embedded tweets etc should be in a div with the class embeded-block, these are removed + embeds = root_elem.xpath('//div[@class=\'embed-block\']') + for embed in embeds: + embed.getparent().remove(embed) - # all embedded tweets etc should be in a div with the class embeded-block, these are removed - embeds = cleaned_xhtml.xpath('//div[@class=\'embed-block\']') - for embed in embeds: - cleaned_xhtml.remove(embed) + if not remove_media_embeds: + item["body_html"] = to_string(root_elem, encoding="unicode", method='html') + return + + # clean all the embedded figures from the html, it will remove the comments as well + cleaner = clean.Cleaner(add_nofollow=False, kill_tags=["figure"]) + cleaned_xhtml = cleaner.clean_html(root_elem) # remove the associations relating to the embeds kill_keys = [key 
for key in item.get("associations", {}) if key.startswith("editor_")] diff --git a/newsroom/wire/formatters/ninjs.py b/newsroom/wire/formatters/ninjs.py index 24514e33e..6d723e42b 100644 --- a/newsroom/wire/formatters/ninjs.py +++ b/newsroom/wire/formatters/ninjs.py @@ -1,6 +1,8 @@ +import flask import json from .base import BaseFormatter from superdesk.utils import json_serialize_datetime_objectId +from newsroom.utils import remove_all_embeds class NINJSFormatter(BaseFormatter): @@ -20,7 +22,21 @@ def format_item(self, item, item_type='items'): return json.dumps(ninjs, default=json_serialize_datetime_objectId) + @staticmethod + def test_for_true(value): + """ + Test if the value indicates true + :param value: + :return: + """ + return value.lower() == 'true' or value == '1' + def _transform_to_ninjs(self, item): + no_embeds = flask.request.args.get('no_embeds', default=False, type=self.test_for_true) + no_media = flask.request.args.get('no_media', default=False, type=self.test_for_true) + if no_media or no_embeds: + remove_all_embeds(item, remove_media_embeds=no_media, remove_by_class=no_embeds) + ninjs = { 'guid': item.get('_id'), 'version': str(item.get('version', 1)),