Skip to content

Commit

Permalink
refactor html formatting for nitf/ninjs (#455)
Browse files Browse the repository at this point in the history
* refactor html formatting for nitf/ninjs

so there is similar logic in both
set ninjs copyrightholder to NTB

SDNTB-811
  • Loading branch information
petrjasek authored Apr 26, 2023
1 parent 35d3efe commit b37e09a
Show file tree
Hide file tree
Showing 4 changed files with 148 additions and 93 deletions.
41 changes: 31 additions & 10 deletions server/ntb/publish/ntb_ninjs.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
import lxml.etree as etree

from flask import g
from typing import Dict, List
from lxml.html import HTMLParser
from superdesk import get_resource_service
from superdesk.etree import clean_html, to_string
from superdesk.publish.formatters.ninjs_formatter import NINJSFormatter
from superdesk.text_utils import get_char_count, get_word_count

from . import utils

Expand Down Expand Up @@ -58,15 +63,18 @@ def _transform_to_ninjs(self, article, subscriber, recursive=True):
if ninjs.get("description_text"):
ninjs["descriptions"] = self.format_descriptions(ninjs)

if ninjs.get("body_html"):
ninjs["bodies"] = self.format_bodies(ninjs)
if article.get("body_html"):
ninjs["bodies"] = self.format_bodies(article)

if ninjs.get("subject"):
ninjs["subjects"] = self.format_subjects(ninjs)

if ninjs.get("place"):
ninjs["places"] = ninjs["place"]

if ninjs.get("guid"):
ninjs.setdefault("uri", ninjs["guid"])

# removed items which mapped according to Ninjs v2 properties
ninjs_properties = [
"headlines",
Expand All @@ -84,7 +92,6 @@ def _transform_to_ninjs(self, article, subscriber, recursive=True):
"copyrightnotice",
"usageterms",
"ednote",
"guid",
"language",
"descriptions",
"bodies",
Expand All @@ -99,14 +106,14 @@ def _transform_to_ninjs(self, article, subscriber, recursive=True):
"by",
"slugline",
"located",
"renditions",
"associations",
"altids",
"trustindicators",
"standard",
"genre",
"rightsinfo",
"service",
"infosources",
]

for key in list(ninjs.keys()):
Expand All @@ -131,9 +138,16 @@ def _transform_to_ninjs(self, article, subscriber, recursive=True):
for tagline in article["sign_off"].split("/"):
ninjs["taglines"].append(tagline.strip())

if article.get("type") == "text":
ninjs["infosources"] = [
{"name": utils.get_distributor(article)},
]

if recursive: # should only run at the end, so do this on top level item only
convert_dicts_to_lists(ninjs)

ninjs["copyrightholder"] = "NTB"

return ninjs

def _format_place(self, article) -> List[Dict]:
Expand All @@ -157,7 +171,6 @@ def _format_place(self, article) -> List[Dict]:
return places

def _format_rendition(self, rendition):
print("IN", rendition)
formatted = super()._format_rendition(rendition)
if formatted.get("mimetype"):
formatted["contenttype"] = formatted.pop("mimetype")
Expand Down Expand Up @@ -189,13 +202,21 @@ def format_headlines(self, article):
def format_descriptions(self, ninjs):
    """Build the ninjs ``descriptions`` list from ``description_text``.

    :param ninjs: mapping holding the already transformed item; only its
        ``description_text`` entry is read.
    :return: a one-element list with the description value tagged as
        ``text/plain``.
    """
    description = {
        "value": ninjs.get("description_text"),
        "contenttype": "text/plain",
    }
    return [description]

def format_bodies(self, ninjs):
def format_bodies(self, article):
html, _ = utils.format_body_content(article)
parser = HTMLParser(recover=True, remove_blank_text=True)
try:
html_tree = etree.fromstring(html, parser)
except Exception as e:
raise ValueError("Can't parse body_html content: {}".format(e))
html_tree_clean = clean_html(html_tree)
html = to_string(html_tree_clean, method="html", remove_root_div=True)
return [
{
"charcount": ninjs.get("charcount"),
"wordcount": ninjs.get("wordcount"),
"value": ninjs.get("body_html"),
"contenttype": "text/plain",
"charcount": get_char_count(html),
"wordcount": get_word_count(html),
"value": html,
"contenttype": "text/html",
}
]

Expand Down
73 changes: 6 additions & 67 deletions server/ntb/publish/ntb_nitf.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,30 +24,19 @@
from superdesk.publish.formatters.nitf_formatter import NITFFormatter, EraseElement
from superdesk.publish.publish_service import PublishService
from superdesk.errors import FormatterError
from superdesk.cache import cache
from superdesk.text_utils import get_text

from . import utils

logger = logging.getLogger(__name__)
tz = None

EMBED_RE = re.compile(
r"<!-- EMBED START ([a-zA-Z]+ {id: \"(?P<id>.+?)\"}) -->.*"
r"<!-- EMBED END \1 -->",
re.DOTALL,
)

FILENAME_FORBIDDEN_RE = re.compile(r"[^a-zA-Z0-9._-]")
STRIP_INVALID_CHARS_RE = re.compile("[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]")
ENCODING = "iso-8859-1"
LANGUAGE = "nb-NO" # default language for ntb
assert ENCODING != "unicode" # use e.g. utf-8 for unicode


def _get_language(article):
return article.get("language") or LANGUAGE


def get_content_field(article, field):
content_type = get_resource_service("content_types").find_one(
req=None, _id=article["profile"]
Expand Down Expand Up @@ -80,19 +69,14 @@ def can_format(self, format_type, article):
format_type == self.FORMAT_TYPE and article[ITEM_TYPE] == CONTENT_TYPE.TEXT
)

def strip_invalid_chars(self, string):
if string is None:
string = ""
return STRIP_INVALID_CHARS_RE.sub("", string)

def format(self, original_article, subscriber, codes=None, encoding="us-ascii"):
article = deepcopy(original_article)
self._populate_metadata(article)
global tz
if tz is None:
# first time this method is launched
# we set timezone and NTB specific filter
tz = pytz.timezone(superdesk.app.config["DEFAULT_TIMEZONE"])
tz = pytz.timezone(app.config["DEFAULT_TIMEZONE"])
try:
if article.get("body_html"):
article["body_html"] = article["body_html"].replace("<br>", "<br />")
Expand All @@ -101,7 +85,7 @@ def format(self, original_article, subscriber, codes=None, encoding="us-ascii"):
)
nitf = self.get_nitf(article, subscriber, pub_seq_num)
try:
nitf.attrib["baselang"] = _get_language(article)
nitf.attrib["baselang"] = utils.get_language(article)
except KeyError:
pass

Expand Down Expand Up @@ -448,11 +432,7 @@ def _format_body_head_dateline(self, article, body_head):
def _format_body_head_distributor(self, article, body_head):
distrib = etree.SubElement(body_head, "distributor")
org = etree.SubElement(distrib, "org")
language = _get_language(article)
if language == "nb-NO":
org.text = "NTB"
elif language == "nn-NO":
org.text = "NPK"
org.text = utils.get_distributor(article)

def _add_media(
self,
Expand Down Expand Up @@ -503,48 +483,7 @@ def _format_body_content(self, article, body_content):
abstract_txt = etree.tostring(abstract, encoding="unicode", method="text")
p.text = abstract_txt

# media
media_data = []
try:
associations = article["associations"]
except KeyError:
pass
else:
feature_image = associations.get("featureimage")
if feature_image is not None:
feature_image["_featured"] = "image"
media_data.append(feature_image)
else:
feature_media = associations.get("featuremedia")
if feature_media is not None:
feature_media["_featured"] = "media"
media_data.append(feature_media)

def repl_embedded(match):
"""Embedded in body_html handling"""
# this method do 2 important things:
# - it remove the embedded from body_html
# - it fill media_data with embedded data in order of appearance
id_ = match.group("id")
try:
data = associations[id_]
except KeyError:
logger.warning("Expected association {} not found!".format(id_))
else:
if data is None:
logger.warning(
"media data for association {} is empty, ignoring!".format(id_)
)
else:
media_data.append(data)
return ""

html = self.strip_invalid_chars(
EMBED_RE.sub(repl_embedded, article.get("body_html") or "")
)
# it is a request from SDNTB-388 to use normal space instead of non breaking spaces
# so we do this replace
html = html.replace("&nbsp;", " ")
html, media_data = utils.format_body_content(article)

# at this point we have media data filled in right order
# and no more embedded in html
Expand Down Expand Up @@ -625,7 +564,7 @@ def repl_embedded(match):
if type_ == "image" or type_ == "grafikk"
else "video/mpeg"
)
caption = self.strip_invalid_chars(data.get("description_text"))
caption = utils.strip_invalid_chars(data.get("description_text"))
self._add_media(body_content, type_, mime_type, source, caption, featured)
media_counter = len(media_data)

Expand Down
86 changes: 86 additions & 0 deletions server/ntb/publish/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,29 @@

import re
import logging
import superdesk.etree as sd_etree

from lxml import etree
from typing import Dict, List, Optional, Tuple


# Fallback language code used by get_language() when an article has none.
LANGUAGE = "nb-NO" # default language for ntb

# Matches a whole embedded block in body_html: from an
# "<!-- EMBED START <type> {id: "<id>"} -->" marker up to the
# "<!-- EMBED END ... -->" marker carrying the same "<type> {id: ...}" text
# (enforced by the \1 backreference). The named group "id" captures the
# embed id; re.DOTALL lets ".*" span newlines inside the block.
EMBED_RE = re.compile(
r"<!-- EMBED START ([a-zA-Z]+ {id: \"(?P<id>.+?)\"}) -->.*"
r"<!-- EMBED END \1 -->",
re.DOTALL,
)

# Matches control characters 0x00-0x08, 0x0B-0x0C, 0x0E-0x1F and 0x7F,
# i.e. every C0 control except tab (0x09), LF (0x0A) and CR (0x0D), plus DEL.
STRIP_INVALID_CHARS_RE = re.compile("[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]")

# Module-level logger named after this module.
logger = logging.getLogger(__name__)


def get_language(article) -> str:
    """Return the article's language code, or ``LANGUAGE`` when it has none.

    A missing, ``None`` or empty ``language`` value all fall back to the
    module default.
    """
    language = article.get("language")
    return language if language else LANGUAGE


def get_rewrite_sequence(article) -> int:
    """Return the article's ``rewrite_sequence`` as an integer.

    A missing or falsy value (``None``, ``0``, ``""``) yields ``0``; any other
    value is converted with ``int()``.
    """
    sequence = article.get("rewrite_sequence")
    if not sequence:
        return 0
    return int(sequence)

Expand All @@ -12,3 +37,64 @@ def get_doc_id(article) -> str:
ntb_id=get_ntb_id(article),
version=get_rewrite_sequence(article),
)


def get_distributor(article) -> str:
    """Return the distributor name for the article's language.

    Articles whose language resolves to ``nn-NO`` are attributed to ``NPK``;
    every other language is attributed to ``NTB``.
    """
    return "NPK" if get_language(article) == "nn-NO" else "NTB"


def strip_invalid_chars(string: Optional[str]) -> str:
    """Remove every character matched by ``STRIP_INVALID_CHARS_RE``.

    ``None`` and the empty string both produce an empty string.
    """
    return STRIP_INVALID_CHARS_RE.sub("", string) if string else ""


def format_body_content(article) -> Tuple[str, List[Dict]]:
    """Strip embeds from the article body and collect the related media.

    :param article: item mapping; ``body_html`` and ``associations`` are read.
    :return: ``(html, media_data)`` where ``html`` is ``body_html`` with all
        embed blocks removed, invalid control characters stripped and
        ``&nbsp;`` replaced by a regular space, and ``media_data`` lists the
        featured media first (``featureimage`` is preferred over
        ``featuremedia``) followed by the embedded associations in their
        order of appearance in the body.

    NOTE: the featured association dict is tagged in place with a
    ``_featured`` key (``"image"`` or ``"media"``), so the passed article is
    modified.
    """
    # A missing or ``None`` ``associations`` value is treated as "no
    # associations". Leaving the name unbound would make ``repl_embedded``
    # fail with a NameError as soon as the body contains an embed marker.
    associations = article.get("associations") or {}

    # media
    media_data: List[Dict] = []
    feature_image = associations.get("featureimage")
    if feature_image is not None:
        feature_image["_featured"] = "image"
        media_data.append(feature_image)
    else:
        feature_media = associations.get("featuremedia")
        if feature_media is not None:
            feature_media["_featured"] = "media"
            media_data.append(feature_media)

    def repl_embedded(match):
        """Drop one embed from the body and record its association."""
        # this callback does 2 important things:
        # - it removes the embed from body_html (by returning "")
        # - it fills media_data with embedded data in order of appearance
        id_ = match.group("id")
        try:
            data = associations[id_]
        except KeyError:
            logger.warning("Expected association {} not found!".format(id_))
        else:
            if data is None:
                logger.warning(
                    "media data for association {} is empty, ignoring!".format(id_)
                )
            else:
                media_data.append(data)
        return ""

    html = strip_invalid_chars(
        EMBED_RE.sub(repl_embedded, article.get("body_html") or "")
    )

    # it is a request from SDNTB-388 to use normal space instead of
    # non breaking spaces, so we do this replace
    html = html.replace("&nbsp;", " ")

    return html, media_data
Loading

0 comments on commit b37e09a

Please sign in to comment.