diff --git a/features/news_api_item.feature b/features/news_api_item.feature index 78709c75..32b0adbb 100644 --- a/features/news_api_item.feature +++ b/features/news_api_item.feature @@ -234,6 +234,35 @@ Feature: News API Item } ] """ + When we get "/news/item/111?format=NINJSFormatter&no_embeds=true&no_media=1" + Then we get existing resource + """ + { + "guid": "111", + "headline": "headline 1", + "body_html": "

Once upon a time there was

who could swim

" + } + """ + When we get "/news/item/111?format=NINJSFormatter2&no_embeds=true" + Then we get existing resource + """ + { + "guid": "111", + "headline": "headline 1", + "body_html": "

Once upon a time there was

who could swim

\"altSome caption

", + "associations": {"editor_19": {"renditions": {"original": {}}}} + } + """ + When we get "/news/item/111?format=NINJSFormatter2&no_media=true" + Then we get existing resource + """ + { + "guid": "111", + "headline": "headline 1", + "body_html": "

Once upon a time there was

a fish

who could swim

", + "associations": {} + } + """ When we get "/news/item/111?format=NINJSFormatter3" Then we get existing resource """ diff --git a/newsroom/utils.py b/newsroom/utils.py index a842b25e..a8e430a9 100644 --- a/newsroom/utils.py +++ b/newsroom/utils.py @@ -463,30 +463,33 @@ def update_embeds_in_body(item, update_image=None, update_audio=None, update_vid item['body_html'] = to_string(root_elem, method="html") -def remove_all_embeds(item): +def remove_all_embeds(item, remove_by_class=True, remove_media_embeds=True): """ Remove the all embeds from the body of the article, including any divs with the embed_block attribute :param item: + :param remove_by_class: If true removes any divs that have the embed-block class, should remove such things as + embedded tweets + :param remove_media_embeds: Remove any figure tags if the passed value is true :return: """ - if not item.get("body_html", ""): return - # clean all the embedded figures from the html - blacklist = ["figure"] root_elem = lxml_html.fromstring(item.get("body_html", "")) - cleaner = clean.Cleaner( - add_nofollow=False, - kill_tags=blacklist - ) - cleaned_xhtml = cleaner.clean_html(root_elem) + if remove_by_class: + # all embedded tweets etc should be in a div with the class embeded-block, these are removed + embeds = root_elem.xpath('//div[@class=\'embed-block\']') + for embed in embeds: + embed.getparent().remove(embed) - # all embedded tweets etc should be in a div with the class embeded-block, these are removed - embeds = cleaned_xhtml.xpath('//div[@class=\'embed-block\']') - for embed in embeds: - cleaned_xhtml.remove(embed) + if not remove_media_embeds: + item["body_html"] = to_string(root_elem, encoding="unicode", method='html') + return + + # clean all the embedded figures from the html, it will remove the comments as well + cleaner = clean.Cleaner(add_nofollow=False, kill_tags=["figure"]) + cleaned_xhtml = cleaner.clean_html(root_elem) # remove the associations relating to the embeds kill_keys = [key for 
key in item.get("associations", {}) if key.startswith("editor_")] diff --git a/newsroom/wire/formatters/ninjs.py b/newsroom/wire/formatters/ninjs.py index 24514e33..6d723e42 100644 --- a/newsroom/wire/formatters/ninjs.py +++ b/newsroom/wire/formatters/ninjs.py @@ -1,6 +1,8 @@ +import flask import json from .base import BaseFormatter from superdesk.utils import json_serialize_datetime_objectId +from newsroom.utils import remove_all_embeds class NINJSFormatter(BaseFormatter): @@ -20,7 +22,21 @@ def format_item(self, item, item_type='items'): return json.dumps(ninjs, default=json_serialize_datetime_objectId) + @staticmethod + def test_for_true(value): + """ + Test if the value indicates true + :param value: the string to test + :return: True if the value is 'true' in any letter case or exactly '1', otherwise False + """ + return value.lower() == 'true' or value == '1' + def _transform_to_ninjs(self, item): + no_embeds = flask.request.args.get('no_embeds', default=False, type=self.test_for_true) + no_media = flask.request.args.get('no_media', default=False, type=self.test_for_true) + if no_media or no_embeds: + remove_all_embeds(item, remove_media_embeds=no_media, remove_by_class=no_embeds) + ninjs = { 'guid': item.get('_id'), 'version': str(item.get('version', 1)),