Skip to content

Commit

Permalink
handle transcripts on publish (#159)
Browse files Browse the repository at this point in the history
- set source based on station subject
- set 90 days expiry for tv/radio transcripts
- populate mediaformat subject if missing
- add script to populate mediaformat for old items
- add sidebar filter for mediaformat

CPCN-49 CPCN-504 CPCN-520
  • Loading branch information
petrjasek authored Jan 11, 2024
1 parent 1ec0853 commit e6413f6
Show file tree
Hide file tree
Showing 7 changed files with 198 additions and 22 deletions.
2 changes: 2 additions & 0 deletions server/cp/commands/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from . import fix_language # noqa
from . import fix_mediaformat # noqa
35 changes: 35 additions & 0 deletions server/cp/commands/fix_mediaformat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import time

from superdesk import get_resource_service
from cp.signals import get_media_type_name, get_media_type_scheme
from newsroom.commands.manager import manager


@manager.command
def fix_mediaformat(resource="items", limit=500, sleep_secs=2):
    """Backfill the media format subject on items which do not have one.

    Searches ``resource`` in batches for items without any subject in the
    media type scheme and appends a ``wiretext`` subject to each, named
    according to the item language.

    :param resource: name of the resource service to process
    :param limit: maximum number of batches to process
    :param sleep_secs: seconds to sleep after each processed batch
    """
    default_code = "wiretext"
    items_service = get_resource_service(resource)
    scheme = get_media_type_scheme()
    # match only items with no subject in the media type scheme
    without_media_type = {
        "bool": {"must_not": {"term": {"subject.scheme": scheme}}}
    }
    search_source = {"query": without_media_type, "size": 100}
    for _ in range(int(limit)):
        batch = items_service.search(search_source)
        if not batch.count():
            # nothing left to fix
            break
        for doc in batch:
            current = doc.get("subject")
            subjects = doc["subject"].copy() if current else []
            subjects.append(
                {
                    "code": default_code,
                    "name": get_media_type_name(default_code, doc.get("language")),
                    "scheme": scheme,
                }
            )
            items_service.system_update(doc["_id"], {"subject": subjects}, doc)
            print(".", end="", flush=True)
        time.sleep(int(sleep_secs))
    print("done.")
71 changes: 66 additions & 5 deletions server/cp/signals.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,30 @@
from typing import Optional
from newsroom.types import User, Company
import cp

from typing import Literal, Optional
from flask import current_app as app

from datetime import datetime, timedelta
from superdesk import get_resource_service
from newsroom.signals import publish_item, user_created, user_updated, user_deleted, push
from newsroom.types import User, Company
from newsroom.signals import (
publish_item,
user_created,
user_updated,
user_deleted,
push,
)

from cp.cem import send_notification


def fix_language(lang) -> str:
    """Reduce a locale code like ``fr-CA`` or ``en_US`` to its lowercase base language."""
    before_dash, _, _ = lang.partition("-")
    base, _, _ = before_dash.partition("_")
    return base.lower()


def on_publish_item(sender, item, **kwargs):
    """Signal handler adjusting an item when it gets published.

    Runs the headline, correction and transcript handlers on ``item`` in that
    order. ``sender`` and any extra keyword arguments are not used.
    """
    handlers = (
        copy_headline2_to_headline,
        copy_correction_to_body_html,
        handle_transcripts,
    )
    for handler in handlers:
        handler(item)


def copy_headline2_to_headline(item):
Expand Down Expand Up @@ -62,13 +72,64 @@ def user_auth_is_gip(user: User) -> bool:
if not user.get("company"):
return False

company: Optional[Company] = get_resource_service("companies").find_one(req=None, _id=user["company"])
company: Optional[Company] = get_resource_service("companies").find_one(
req=None, _id=user["company"]
)
if not company:
return False

return company.get("auth_provider") == "gip"


def handle_transcripts(item):
    """Populate the media format subject and apply transcript rules on publish.

    The item is modified in place:

    - when there is no subject in the media type scheme, a ``wiretext`` one
      is appended and nothing else is done,
    - for French items the media type name is replaced with its French label
      (only for codes which have a known label),
    - when there is a subject in the media source scheme, its name is used
      as the item ``source``,
    - TV and radio station items get a 90 days ``expiry`` unless one is
      already set.

    :param item: the item being published
    """
    # ``subject`` can be missing or explicitly ``None``
    if item.get("subject") is None:
        item["subject"] = []

    # ``language`` can be missing or explicitly ``None``, fallback to english
    language = item.get("language") or "en"

    media_type_scheme = get_media_type_scheme()
    media_type = next(
        (s for s in item["subject"] if s.get("scheme") == media_type_scheme), None
    )
    media_source_scheme = app.config.get("MEDIA_SOURCE_SCHEME", "station")
    media_source = next(
        (s for s in item["subject"] if s.get("scheme") == media_source_scheme), None
    )

    if not media_type:
        # no media format info means it is a regular wire text item
        item["subject"].append(
            dict(
                name=get_media_type_name("wiretext", language),
                code="wiretext",
                scheme=media_type_scheme,
            )
        )
        return

    media_type_code = media_type.get("code")

    # only translate codes we have a label for, keep the name otherwise
    if "fr" in language and media_type_code in MEDIA_TYPE_NAMES:
        media_type["name"] = get_media_type_name(media_type_code, language)

    if media_source:
        item["source"] = media_source["name"]

    # "radionstation" is a misspelling which was used here previously,
    # it is still accepted in case such items exist
    if media_type_code in ("tvstation", "radiostation", "radionstation"):
        # it might be already populated based on previous segment
        item.setdefault("expiry", datetime.utcnow() + timedelta(days=90))


# Known media type codes, these must match the keys of MEDIA_TYPE_NAMES.
MediaType = Literal["radiostation", "tvstation", "wireaudio", "wiretext"]

# Media type names by code as (English, French) pairs.
MEDIA_TYPE_NAMES = {
    "wiretext": ("Wire text", "Texte fil de presse"),
    "wireaudio": ("Wire audio", "Audio fil de presse"),
    "tvstation": ("TV station", "Station de télé"),
    "radiostation": ("Radio station", "Station de radio"),
}


def get_media_type_scheme():
    """Return the subject scheme used for media types.

    It is read from the ``MEDIA_TYPE_CV`` app config and defaults to
    ``"mediaformat"``.
    """
    config = app.config
    return config.get("MEDIA_TYPE_CV", "mediaformat")


def get_media_type_name(scheme: MediaType, language: Optional[str] = "en") -> str:
    """Return the name for a media type in the given language.

    :param scheme: key to look up in ``MEDIA_TYPE_NAMES``
    :param language: item language, the French name is returned if it
        contains ``fr``, the English one otherwise (also when it is empty)
    :raises KeyError: if there is no entry for ``scheme``
    """
    names = MEDIA_TYPE_NAMES[scheme]
    if language and "fr" in language:
        return names[1]
    return names[0]


def init_app(app):
publish_item.connect(on_publish_item)
user_created.connect(on_user_created)
Expand Down
2 changes: 1 addition & 1 deletion server/manage.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from newsroom.commands import * # noqa
from newsroom.commands.manager import manager

from cp.commands.fix_language import fix_language # noqa
import cp.commands # noqa


if __name__ == "__main__":
Expand Down
9 changes: 9 additions & 0 deletions server/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,15 @@
"field": "language",
"label": lazy_gettext("Language"),
},
{
"field": "mediaformat",
"label": lazy_gettext("Media type"),
"nested": {
"parent": "subject",
"field": "scheme",
"value": "mediaformat",
},
},
{
"field": "source",
"label": lazy_gettext("Source"),
Expand Down
23 changes: 23 additions & 0 deletions server/tests/test_commands.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from cp.commands.fix_mediaformat import fix_mediaformat


def test_fix_mediaformat(app):
    """Items without media format get the wiretext subject named in their language."""
    docs = [
        {"_id": lang, "language": lang, "type": "text"} for lang in ("en", "fr")
    ]
    app.data.insert("items", docs)

    fix_mediaformat()

    def stored(item_id):
        # fetch the current version of the item
        return app.data.find_one("items", req=None, _id=item_id)

    en_item = stored("en")
    assert "subject" in en_item
    en_subjects = en_item["subject"]
    assert len(en_subjects) == 1
    media_format = en_subjects[0]
    assert media_format["code"] == "wiretext"
    assert media_format["name"] == "Wire text"
    assert media_format["scheme"] == "mediaformat"

    fr_item = stored("fr")
    assert fr_item["subject"][0]["name"] == "Texte fil de presse"
78 changes: 62 additions & 16 deletions server/tests/test_signals.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,28 +3,29 @@
import responses
import cp.signals as signals

from datetime import datetime, timedelta
from responses import matchers


def test_on_publish_no_extended_headline():
def test_on_publish_no_extended_headline(app):
item = {"headline": "foo"}
signals.on_publish_item(None, item)
assert item["headline"] == "foo"


def test_on_publish_empty_extended_headline():
def test_on_publish_empty_extended_headline(app):
item = {"headline": "foo", "extra": {cp.HEADLINE2: ""}}
signals.on_publish_item(None, item)
assert item["headline"] == "foo"


def test_on_publish_copy_extended_headline():
def test_on_publish_copy_extended_headline(app):
item = {"headline": "foo", "extra": {cp.HEADLINE2: "bar"}}
signals.on_publish_item(None, item)
assert item["headline"] == "bar"


def test_on_publish_add_correction_to_body_html():
def test_on_publish_add_correction_to_body_html(app):
item = {
"body_html": "<p>some text</p><p>another one</p>",
"extra": {"correction": "correction info"},
Expand All @@ -45,12 +46,17 @@ def test_cem_notification_on_user_changes(app):
}
)
company_id = bson.ObjectId()
app.data.insert("companies", [{
"_id": company_id,
"name": "Example Company",
"is_enabled": True,
"auth_provider": "gip",
}])
app.data.insert(
"companies",
[
{
"_id": company_id,
"name": "Example Company",
"is_enabled": True,
"auth_provider": "gip",
}
],
)
user = {"_id": bson.ObjectId(), "email": "[email protected]", "company": company_id}

with responses.RequestsMock(assert_all_requests_are_fired=True) as rsps:
Expand Down Expand Up @@ -140,12 +146,17 @@ def test_cem_notification_for_non_google_auth(app, mocker):
}
)
company_id = bson.ObjectId()
app.data.insert("companies", [{
"_id": company_id,
"name": "Example Company",
"is_enabled": True,
"auth_provider": "azure",
}])
app.data.insert(
"companies",
[
{
"_id": company_id,
"name": "Example Company",
"is_enabled": True,
"auth_provider": "azure",
}
],
)
user = {"_id": bson.ObjectId(), "email": "[email protected]", "company": company_id}

signals.on_user_created(None, user=user, foo=1)
Expand All @@ -172,3 +183,38 @@ def test_language_agenda():
item["language"] = "fr-ca"
signals.push.send(None, item=item)
assert "fr" == item["language"]


def test_handle_transcripts(app):
    """Publishing sets media format, source and expiry based on item subjects."""
    # text item without media format gets the wiretext one
    wire_en = {"source": "CP", "subject": []}
    signals.on_publish_item(None, wire_en)
    assert len(wire_en["subject"]) == 1
    added = wire_en["subject"][0]
    assert added["scheme"] == "mediaformat"
    assert added["code"] == "wiretext"
    assert added["name"] == "Wire text"

    # french items get the french name
    wire_fr = {"source": "CP", "subject": [], "language": "fr_CA"}
    signals.on_publish_item(None, wire_fr)
    assert wire_fr["subject"][0]["name"] == "Texte fil de presse"

    # tv transcript gets source from the station subject and an expiry
    media_format = {"code": "tvstation", "name": "TV Station", "scheme": "mediaformat"}
    station = {"code": "CITY24", "name": "CP24 (CITY24)", "scheme": "station"}
    tv_item = {"source": "TVEyes", "subject": [media_format, station]}

    signals.on_publish_item(None, tv_item)
    assert tv_item["source"] == "CP24 (CITY24)"
    assert tv_item["subject"][0]["name"] == "TV Station"
    assert "expiry" in tv_item
    assert (
        datetime.now()
        < tv_item["expiry"]
        < datetime.now() + timedelta(days=100)
    )

    # publishing it again as french translates the media format name
    tv_item["language"] = "fr-CA"
    signals.on_publish_item(None, tv_item)
    assert tv_item["subject"][0]["name"] == "Station de télé"

0 comments on commit e6413f6

Please sign in to comment.