handle transcripts on publish (#159)

- set source based on station subject
- set 90 days expiry for tv/radio transcripts
- populate mediaformat subject if missing
- add script to populate mediaformat for old items
- add sidebar filter for mediaformat

CPCN-49 CPCN-504 CPCN-520
superdesk · Jan 11, 2024 · e6413f6 · e6413f6
1 parent 1ec0853
commit e6413f6
Show file tree

Hide file tree

Showing 7 changed files with 198 additions and 22 deletions.
diff --git a/server/cp/commands/__init__.py b/server/cp/commands/__init__.py
@@ -0,0 +1,2 @@
+from . import fix_language  # noqa
+from . import fix_mediaformat  # noqa
diff --git a/server/cp/commands/fix_mediaformat.py b/server/cp/commands/fix_mediaformat.py
@@ -0,0 +1,35 @@
+import time
+
+from superdesk import get_resource_service
+from cp.signals import get_media_type_name, get_media_type_scheme
+from newsroom.commands.manager import manager
+
+
+@manager.command
+def fix_mediaformat(resource="items", limit=500, sleep_secs=2):
+    """Backfill the media format subject on items that do not have one.
+
+    Repeatedly searches ``resource`` for items without a subject in the media
+    type scheme and appends a ``wiretext`` subject to each of them, using the
+    item language to pick the subject name.
+
+    :param resource: name of the resource service to search and update.
+    :param limit: maximum number of search/update passes; each pass asks for
+        up to 100 items. Converted with ``int()``.
+    :param sleep_secs: seconds to sleep after every pass. Converted with
+        ``int()``.
+    """
+    service = get_resource_service(resource)
+    media_type_scheme = get_media_type_scheme()
+    source = {
+        "query": {
+            "bool": {"must_not": {"term": {"subject.scheme": media_type_scheme}}}
+        },
+        "size": 100,
+    }
+    for _ in range(int(limit)):
+        items = service.search(source)
+        if not items.count():
+            break
+        for item in items:
+            subject = list(item.get("subject") or [])
+            if any(s.get("scheme") == media_type_scheme for s in subject):
+                # Guard against the search returning an item that already
+                # carries a media format subject, so it is never duplicated.
+                continue
+            subject.append(
+                dict(
+                    code="wiretext",
+                    name=get_media_type_name("wiretext", item.get("language")),
+                    scheme=media_type_scheme,
+                )
+            )
+
+            service.system_update(item["_id"], {"subject": subject}, item)
+        # One dot per pass as a progress indicator.
+        print(".", end="", flush=True)
+        time.sleep(int(sleep_secs))
+    print("done.")
diff --git a/server/cp/signals.py b/server/cp/signals.py
@@ -1,20 +1,30 @@
-from typing import Optional
-from newsroom.types import User, Company
 import cp
 
+from typing import Literal, Optional
+from flask import current_app as app
+
+from datetime import datetime, timedelta
 from superdesk import get_resource_service
-from newsroom.signals import publish_item, user_created, user_updated, user_deleted, push
+from newsroom.types import User, Company
+from newsroom.signals import (
+    publish_item,
+    user_created,
+    user_updated,
+    user_deleted,
+    push,
+)
 
 from cp.cem import send_notification
 
 
 def fix_language(lang) -> str:
-    return lang.split('-')[0].split('_')[0].lower()
+    return lang.split("-")[0].split("_")[0].lower()
 
 
 def on_publish_item(sender, item, **kwargs):
     copy_headline2_to_headline(item)
     copy_correction_to_body_html(item)
+    handle_transcripts(item)
 
 
 def copy_headline2_to_headline(item):
@@ -62,13 +72,64 @@ def user_auth_is_gip(user: User) -> bool:
     if not user.get("company"):
         return False
 
-    company: Optional[Company] = get_resource_service("companies").find_one(req=None, _id=user["company"])
+    company: Optional[Company] = get_resource_service("companies").find_one(
+        req=None, _id=user["company"]
+    )
     if not company:
         return False
 
     return company.get("auth_provider") == "gip"
 
 
+def handle_transcripts(item):
+    """Normalise media format, source and expiry of an item being published.
+
+    The item is modified in place:
+
+    - an item without a subject in the media type scheme gets a ``wiretext``
+      subject appended and is otherwise left untouched;
+    - for items whose language contains ``fr`` the media type subject name is
+      replaced with the French label of its code;
+    - when a subject in the media source scheme is present, its name becomes
+      the item ``source``;
+    - TV and radio station items get an ``expiry`` 90 days from now unless an
+      expiry is already set.
+
+    :param item: the item dict passed to the publish signal.
+    """
+    if item.get("subject") is None:
+        # Covers both a missing key and an explicit ``None`` value.
+        item["subject"] = []
+    media_type_scheme = get_media_type_scheme()
+    media_type = next(
+        (s for s in item["subject"] if s.get("scheme") == media_type_scheme), None
+    )
+    media_source_scheme = app.config.get("MEDIA_SOURCE_SCHEME", "station")
+    media_source = next(
+        (s for s in item["subject"] if s.get("scheme") == media_source_scheme), None
+    )
+
+    if not media_type:
+        # No media format on the item: tag it as wire text and stop here.
+        item["subject"].append(
+            dict(
+                name=get_media_type_name("wiretext", item.get("language")),
+                code="wiretext",
+                scheme=media_type_scheme,
+            )
+        )
+        return
+
+    # ``language`` may be present but None; treat that like the default.
+    language = item.get("language") or "en"
+    media_code = media_type.get("code")
+
+    # Only translate codes that have a known label, unknown codes keep the
+    # name they arrived with instead of raising KeyError.
+    if "fr" in language and media_code in MEDIA_TYPE_NAMES:
+        media_type["name"] = get_media_type_name(media_code, language)
+
+    if media_source and media_source.get("name"):
+        item["source"] = media_source["name"]
+
+    # "radiostation" matches the key used in MEDIA_TYPE_NAMES; the misspelled
+    # "radionstation" is still accepted because earlier code matched on it.
+    if media_code in ("tvstation", "radiostation", "radionstation"):
+        # it might be already populated based on previous segment
+        item.setdefault("expiry", datetime.utcnow() + timedelta(days=90))
+
+
+# Media format codes that have an entry in MEDIA_TYPE_NAMES.
+MediaType = Literal["radiostation", "tvstation", "wireaudio", "wiretext"]
+# Display names per media format code, as (English, French).
+MEDIA_TYPE_NAMES = {
+    "wiretext": ("Wire text", "Texte fil de presse"),
+    "wireaudio": ("Wire audio", "Audio fil de presse"),
+    "tvstation": ("TV station", "Station de télé"),
+    "radiostation": ("Radio station", "Station de radio"),
+}
+
+
+def get_media_type_scheme():
+    """Return the subject scheme used for media format subjects.
+
+    Read from the ``MEDIA_TYPE_CV`` app config key, falling back to
+    ``"mediaformat"`` when the key is not configured.
+    """
+    scheme = app.config.get("MEDIA_TYPE_CV", "mediaformat")
+    return scheme
+
+
+def get_media_type_name(scheme: MediaType, language: Optional[str] = "en") -> str:
+    """Return the display name for a media format code.
+
+    :param scheme: media format code used as the key into MEDIA_TYPE_NAMES.
+    :param language: item language; the French name is returned when it
+        contains ``"fr"``, otherwise (including ``None``) the English one.
+    :raises KeyError: when ``scheme`` has no entry in MEDIA_TYPE_NAMES.
+    """
+    names = MEDIA_TYPE_NAMES[scheme]
+    is_french = bool(language) and "fr" in language
+    return names[1] if is_french else names[0]
+
+
 def init_app(app):
     publish_item.connect(on_publish_item)
     user_created.connect(on_user_created)

diff --git a/server/manage.py b/server/manage.py
@@ -1,7 +1,7 @@
 from newsroom.commands import * # noqa
 from newsroom.commands.manager import manager
 
-from cp.commands.fix_language import fix_language  # noqa
+import cp.commands  # noqa
 
 
 if __name__ == "__main__":

diff --git a/server/settings.py b/server/settings.py
@@ -143,6 +143,15 @@
         "field": "language",
         "label": lazy_gettext("Language"),
     },
+    {
+        "field": "mediaformat",
+        "label": lazy_gettext("Media type"),
+        "nested": {
+            "parent": "subject",
+            "field": "scheme",
+            "value": "mediaformat",
+        },
+    },
     {
         "field": "source",
         "label": lazy_gettext("Source"),

diff --git a/server/tests/test_commands.py b/server/tests/test_commands.py
@@ -0,0 +1,23 @@
+from cp.commands.fix_mediaformat import fix_mediaformat
+
+
+def test_fix_mediaformat(app):
+    """Items without a media format subject get a localized wiretext one."""
+    docs = [
+        {"_id": language, "language": language, "type": "text"}
+        for language in ("en", "fr")
+    ]
+    app.data.insert("items", docs)
+
+    fix_mediaformat()
+
+    en_item = app.data.find_one("items", req=None, _id="en")
+    assert "subject" in en_item
+    en_subjects = en_item["subject"]
+    assert 1 == len(en_subjects)
+    en_media_format = en_subjects[0]
+    assert "wiretext" == en_media_format["code"]
+    assert "Wire text" == en_media_format["name"]
+    assert "mediaformat" == en_media_format["scheme"]
+
+    fr_item = app.data.find_one("items", req=None, _id="fr")
+    fr_media_format = fr_item["subject"][0]
+    assert "Texte fil de presse" == fr_media_format["name"]
diff --git a/server/tests/test_signals.py b/server/tests/test_signals.py
@@ -3,28 +3,29 @@
 import responses
 import cp.signals as signals
 
+from datetime import datetime, timedelta
 from responses import matchers
 
 
-def test_on_publish_no_extended_headline():
+def test_on_publish_no_extended_headline(app):
     item = {"headline": "foo"}
     signals.on_publish_item(None, item)
     assert item["headline"] == "foo"
 
 
-def test_on_publish_empty_extended_headline():
+def test_on_publish_empty_extended_headline(app):
     item = {"headline": "foo", "extra": {cp.HEADLINE2: ""}}
     signals.on_publish_item(None, item)
     assert item["headline"] == "foo"
 
 
-def test_on_publish_copy_extended_headline():
+def test_on_publish_copy_extended_headline(app):
     item = {"headline": "foo", "extra": {cp.HEADLINE2: "bar"}}
     signals.on_publish_item(None, item)
     assert item["headline"] == "bar"
 
 
-def test_on_publish_add_correction_to_body_html():
+def test_on_publish_add_correction_to_body_html(app):
     item = {
         "body_html": "<p>some text</p><p>another one</p>",
         "extra": {"correction": "correction info"},
@@ -45,12 +46,17 @@ def test_cem_notification_on_user_changes(app):
         }
     )
     company_id = bson.ObjectId()
-    app.data.insert("companies", [{
-        "_id": company_id,
-        "name": "Example Company",
-        "is_enabled": True,
-        "auth_provider": "gip",
-    }])
+    app.data.insert(
+        "companies",
+        [
+            {
+                "_id": company_id,
+                "name": "Example Company",
+                "is_enabled": True,
+                "auth_provider": "gip",
+            }
+        ],
+    )
     user = {"_id": bson.ObjectId(), "email": "[email protected]", "company": company_id}
 
     with responses.RequestsMock(assert_all_requests_are_fired=True) as rsps:
@@ -140,12 +146,17 @@ def test_cem_notification_for_non_google_auth(app, mocker):
         }
     )
     company_id = bson.ObjectId()
-    app.data.insert("companies", [{
-        "_id": company_id,
-        "name": "Example Company",
-        "is_enabled": True,
-        "auth_provider": "azure",
-    }])
+    app.data.insert(
+        "companies",
+        [
+            {
+                "_id": company_id,
+                "name": "Example Company",
+                "is_enabled": True,
+                "auth_provider": "azure",
+            }
+        ],
+    )
     user = {"_id": bson.ObjectId(), "email": "[email protected]", "company": company_id}
 
     signals.on_user_created(None, user=user, foo=1)
@@ -172,3 +183,38 @@ def test_language_agenda():
     item["language"] = "fr-ca"
     signals.push.send(None, item=item)
     assert "fr" == item["language"]
+
+
+def test_handle_transcripts(app):
+    """Publishing fills in media format, source and expiry as expected."""
+    # Plain text item without language: wiretext subject with English name.
+    english_text = {"source": "CP", "subject": []}
+    signals.on_publish_item(None, english_text)
+    assert 1 == len(english_text["subject"])
+    added = english_text["subject"][0]
+    assert "mediaformat" == added["scheme"]
+    assert "wiretext" == added["code"]
+    assert "Wire text" == added["name"]
+
+    # Same for a French item: the name is localized.
+    french_text = {"source": "CP", "subject": [], "language": "fr_CA"}
+    signals.on_publish_item(None, french_text)
+    assert "Texte fil de presse" == french_text["subject"][0]["name"]
+
+    # TV transcript: source comes from the station subject, expiry is set.
+    tv_subject = {"code": "tvstation", "name": "TV Station", "scheme": "mediaformat"}
+    station_subject = {"code": "CITY24", "name": "CP24 (CITY24)", "scheme": "station"}
+    transcript_item = {
+        "source": "TVEyes",
+        "subject": [tv_subject, station_subject],
+    }
+
+    signals.on_publish_item(None, transcript_item)
+    assert "CP24 (CITY24)" == transcript_item["source"]
+    assert "TV Station" == transcript_item["subject"][0]["name"]
+    assert "expiry" in transcript_item
+    assert (
+        datetime.now()
+        < transcript_item["expiry"]
+        < datetime.now() + timedelta(days=100)
+    )
+
+    # Republishing as French replaces the media format name.
+    transcript_item["language"] = "fr-CA"
+    signals.on_publish_item(None, transcript_item)
+    assert "Station de télé" == transcript_item["subject"][0]["name"]
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		from . import fix_language # noqa
		from . import fix_mediaformat # noqa