From a17bbba3b76c89a014fd747cced96a6e130f92ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20Ja=C5=A1ek?= Date: Wed, 7 Feb 2024 10:56:41 +0100 Subject: [PATCH] fix transcripts parser (#192) * fix linking to segment 1 it has version 0 and we were only checking 1+ CPCN-598 * avoid html conversion for transcripts CPCN-607 * revert test change * avoid any links in transcripts --- server/cp/ingest/parser/cp_transcripts.py | 29 ++++++++++++------- server/tests/ingest/parser/cp_transcripts.py | 10 +++---- .../cp_transcripts/cp_transcripts.json | 2 +- 3 files changed, 25 insertions(+), 16 deletions(-) diff --git a/server/cp/ingest/parser/cp_transcripts.py b/server/cp/ingest/parser/cp_transcripts.py index 8a4af0e8..c5b73a50 100644 --- a/server/cp/ingest/parser/cp_transcripts.py +++ b/server/cp/ingest/parser/cp_transcripts.py @@ -2,13 +2,16 @@ from superdesk import get_resource_service from superdesk.io.feed_parsers.ninjs import NINJSFeedParser -from superdesk.text_utils import plain_text_to_html -def get_previous_version(original_ingest_id: str, version_number: int) -> Optional[Dict[str, Any]]: - while version_number > 0: +def get_previous_version( + original_ingest_id: str, version_number: int +) -> Optional[Dict[str, Any]]: + while version_number >= 0: ingest_id = f"{original_ingest_id}.{version_number}" - prev_item = get_resource_service("archive").find_one(req=None, ingest_id=ingest_id) + prev_item = get_resource_service("archive").find_one( + req=None, ingest_id=ingest_id + ) if prev_item is not None: return prev_item @@ -27,12 +30,18 @@ def _transform_from_ninjs(self, ninjs: Dict[str, Any]): ninjs["guid"] = f"{original_guid}.{version}" item = super()._transform_from_ninjs(ninjs) item["version"] = version - item["body_html"] = plain_text_to_html(item["body_html"]) - item.setdefault("extra", {}).update(dict( - publish_ingest_id_as_guid=True, - cp_version=version, - type="transcript", - )) + item["body_html"] = ( + item["body_html"] + if 
item["body_html"].strip().startswith("<p>") + else "<p>{}</p>".format(item["body_html"]) + ) + item.setdefault("extra", {}).update( + dict( + publish_ingest_id_as_guid=True, + cp_version=version, + type="transcript", + ) + ) previous_item = get_previous_version(original_guid, version - 1) if previous_item is not None: diff --git a/server/tests/ingest/parser/cp_transcripts.py b/server/tests/ingest/parser/cp_transcripts.py index 1b593c95..7a322cdd 100644 --- a/server/tests/ingest/parser/cp_transcripts.py +++ b/server/tests/ingest/parser/cp_transcripts.py @@ -20,7 +20,7 @@ class CP_Transcripts_ParseTestCase(unittest.TestCase): def test_parse(self): with self.app.app_context(), patch.dict(superdesk.resources, resources): superdesk.resources["archive"].service.find_one.side_effect = [ - {"ingest_id": "d3c8487a-1757-4dde-8bb5-22ca166c1e67.1", "version": 2, "extra": {"ap_version": 999}}, + {"ingest_id": "d3c8487a-1757-4dde-8bb5-22ca166c1e67.0", "version": 0, "extra": {"ap_version": 999}}, ] items = parser.parse(get_fixture_path("cp_transcripts.json", "cp_transcripts"), provider) superdesk.resources["archive"].service.find_one.side_effect = None @@ -29,8 +29,8 @@ def test_parse(self): self.assertEqual("text", item["type"]) self.assertEqual("transcript", item["extra"]["type"]) self.assertEqual(True, item["extra"]["publish_ingest_id_as_guid"]) - self.assertEqual(2, item["extra"]["cp_version"]) - self.assertEqual("d3c8487a-1757-4dde-8bb5-22ca166c1e67.2", item["guid"]) - self.assertEqual(2, item["version"]) - self.assertEqual("d3c8487a-1757-4dde-8bb5-22ca166c1e67.1", item["rewrite_of"]) + self.assertEqual(1, item["extra"]["cp_version"]) + self.assertEqual("d3c8487a-1757-4dde-8bb5-22ca166c1e67.1", item["guid"]) + self.assertEqual(1, item["version"]) + self.assertEqual("d3c8487a-1757-4dde-8bb5-22ca166c1e67.0", item["rewrite_of"]) self.assertTrue(item["body_html"].startswith("<p>laying around")) diff --git a/server/tests/ingest/parser/fixtures/cp_transcripts/cp_transcripts.json b/server/tests/ingest/parser/fixtures/cp_transcripts/cp_transcripts.json index 6acd4258..39c01126 100644 --- a/server/tests/ingest/parser/fixtures/cp_transcripts/cp_transcripts.json +++ b/server/tests/ingest/parser/fixtures/cp_transcripts/cp_transcripts.json @@ -1,6 +1,6 @@ { "guid": "d3c8487a-1757-4dde-8bb5-22ca166c1e67", - "version": "2", + "version": "1", "type": "text", "located": "Toronto, ON", "language": "en-CA",