Skip to content

Commit

Permalink
fix the Belga dpa parser to take better care of the ed note [SDBELGA-…
Browse files Browse the repository at this point in the history
…910] (#661)

* fix the Belga dpa parser to take better care of the ed note [SDBELGA-910]

* create a helper func

* fix black SDBELGA-910

* refactor code via black

* update logic for extracting ednote

* address comment

* remove unwanted code

* remove unused func
  • Loading branch information
devketanpro authored Dec 13, 2024
1 parent 9ef9c76 commit 7e0ad6b
Show file tree
Hide file tree
Showing 6 changed files with 197 additions and 31 deletions.
10 changes: 7 additions & 3 deletions server/belga/io/feed_parsers/belga_dpa_newsml_2_0.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,10 +174,14 @@ def parse_inline_content(self, tree, item):

def parse_item_meta(self, tree, item):
super().parse_item_meta(tree, item)

meta = tree.find(self.qname("itemMeta"))
edNote = meta.find(self.qname("edNote"))
text = ElementTree.tostring(edNote, encoding="utf-8", method="text")
item["ednote"] = text.decode("utf-8").replace(" \n", "").replace(" ", "")
item["ednote"] = "\n".join(
edNote.text.strip()
for edNote in meta.findall(self.qname("edNote"))
if "dpaednoterole:correctionshort" == edNote.attrib.get("role", "")
and edNote.text
)

def parse_content_meta(self, tree, item):
meta = super().parse_content_meta(tree, item)
Expand Down
8 changes: 5 additions & 3 deletions server/belga/planning_exports/format_news_events_tommorow.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,11 @@ def format_event_for_tommorow(
# Format event details
formatted_event = {
"subject": ",".join(get_subjects(event, "fr")),
"calendars": event["calendars"][0]["qcode"].capitalize()
if event.get("calendars")
else "",
"calendars": (
event["calendars"][0]["qcode"].capitalize()
if event.get("calendars")
else ""
),
"contacts": get_formatted_contacts(event),
"coverages": get_coverages(event, locale),
"location": get_item_location(event, locale),
Expand Down
8 changes: 5 additions & 3 deletions server/belga/publish/belga_newsml_1_2.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,9 +355,11 @@ def _format_text(self, newscomponent_1_level, item):
newscomponent_2_level,
"Role",
{
"FormalName": item["_role"]
if item.get("profile") in self.SD_CP_NAME_ROLE_MAP
else item.get("_role").split(" -")[0].title()
"FormalName": (
item["_role"]
if item.get("profile") in self.SD_CP_NAME_ROLE_MAP
else item.get("_role").split(" -")[0].title()
)
},
)
# NewsLines
Expand Down
6 changes: 3 additions & 3 deletions server/belga/search_providers.py
Original file line number Diff line number Diff line change
Expand Up @@ -542,9 +542,9 @@ def format_list_item(self, data):
"sign_off": self.get_sign_off(data.get("authors")),
"authors": self.get_authors(data.get("authors")),
"subject": self.get_subjects(data),
"renditions": self.get_renditions(data)
if data.get("assetType") == "Picture"
else {},
"renditions": (
self.get_renditions(data) if data.get("assetType") == "Picture" else {}
),
# SDBELGA-665
"ednote": get_text(data.get("editorialInfo")),
}
Expand Down
25 changes: 6 additions & 19 deletions server/tests/io/feed_parsers/belga_dpa_newsml_2_0_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,25 +43,6 @@ def test_content(self):
self.assertEqual(item["type"], "text")
self.assertEqual(str(item["versioncreated"]), "2019-06-03 13:00:01+00:00")
self.assertEqual(item["pubstatus"], "usable")
expected_ednote = (
"\nNotizblock"
"\nRedaktionelle Hinweise"
"\n Migranten sind nach Definition der Internationalen Organisation ‎für Migration (IOM) alle Menschen,"
" die ihren Wohnort verlassen – egal ‎aus welchen Gründen, wie lange oder ob freiwillig oder "
"‎unfreiwillig. Flüchtlinge dagegen suchen Schutz vor Krieg oder vor ‎drohender Verfolgung, etwa "
"wegen ihrer Religion, Nationalität oder ‎ihrer politischen Überzeugung. Damit sind Flüchtlinge "
"auch Migranten‎, aber nicht alle Migranten Flüchtlinge."
"\n Internet"
"\nKüstenwache Mitteilungen"
"\nOrte"
"\n [Alexandroupolis](Alexandroupolis 681 00, Griechenland)"
"\n[Kleininsel Agathonisi](Agathonisi, Griechenland)"
"\n Die folgenden Informationen sind nicht zur Veröffentlichung bestimmt"
"\nKontakte"
"\n Autor: Takis Tsafos (Athen), +30 6944 33 24 77, <[email protected]>"
"\ndpa tt xx n1\n"
)
self.assertEqual(item["ednote"], expected_ednote)
self.assertEqual(item["urgency"], 3)
self.assertEqual(
item["headline"],
Expand Down Expand Up @@ -184,3 +165,9 @@ def test_new_mappings(self):
expected_subject.sort(key=lambda i: i["name"])
self.assertEqual(item["extra"], {"city": "Berlin", "country": "Germany"})
self.assertEqual(item["genre"], [{"name": "EXTRA"}])

def test_edNote_content(self):
filename = "3FB1C600A1AC5567.xml"
self._initialize_parser(filename)
item = self.item[0]
self.assertEqual(item["ednote"], "updated with a photo")
171 changes: 171 additions & 0 deletions server/tests/io/fixtures/3FB1C600A1AC5567.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
<?xml version="1.0" encoding="UTF-8"?>
<newsMessage xmlns="http://iptc.org/std/nar/2006-10-01/">
<header>
<sent>2024-07-03T13:14:00+02:00</sent>
<priority>3</priority>
<origin>ines</origin>
</header>
<itemSet>
<newsItem xmlns:dpa="http://www.dpa.com/iptc/nar/2008-12-01/" xmlns:h="http://www.w3.org/1999/xhtml" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" conformance="power" guid="urn:newsml:dpa.com:20090101:240703-99-619526" standard="NewsML-G2" standardversion="2.28" version="4" xml:lang="en">
<catalogRef href="http://www.iptc.org/std/catalog/catalog.IPTC-G2-Standards_32.xml"/>
<catalogRef href="http://g2.dpa.com/catalog/catalog001.xml"/>
<rightsInfo>
<copyrightHolder qcode="nprov:dpa">
<name xml:lang="en">Deutsche Presse-Agentur GmbH</name>
<definition xml:lang="en">dpa - Deutsche Presse Agentur GmbH / Hamburg Trade Register, HRB 68431</definition>
</copyrightHolder>
<copyrightNotice xml:lang="en">(c) 2024 dpa Deutsche Presse Agentur GmbH</copyrightNotice>
<usageTerms xml:lang="en">Use only with the written agreement with dpa</usageTerms>
</rightsInfo>
<itemMeta>
<itemClass qcode="ninat:text">
<name xml:lang="en">Text Item(s)</name>
</itemClass>
<provider qcode="nprov:dpa">
<name xml:lang="en">Deutsche Presse-Agentur GmbH</name>
</provider>
<versionCreated>2024-07-03T13:13:54+02:00</versionCreated>
<pubStatus qcode="stat:usable">
<name xml:lang="en">Usable</name>
</pubStatus>
<generator versioninfo="2.1.73">service-ines</generator>
<profile versioninfo="2.0.0">dpa-G2</profile>
<service qcode="dpasrv:eca-mm">
<name role="nrol:full">Englischer Dienst multimedial</name>
</service>
<service qcode="dpasrv:eca">
<name role="nrol:full">Englischer Dienst</name>
</service>
<edNote role="dpaednoterole:closingline">dpa cis aha wjh</edNote>
<edNote pubconstraint="dpapconstraint:nonpublic" role="dpaednoterole:dpacontacts">Reporting by: Ciarán Sunderland and Ansgar Haase in Brussels</edNote>
<edNote pubconstraint="dpapconstraint:nonpublic" role="dpaednoterole:dpacontacts">Editing by: Bill Heaney, +49 30 2852 31472, &lt;[email protected]&gt;</edNote>
<edNote role="dpaednoterole:picture">dpa photos</edNote>
<edNote role="dpaednoterole:correctionshort">updated with a photo</edNote>
<edNote role="dpaednoterole:notepad">
<section xmlns="http://www.w3.org/1999/xhtml" class="notepad">
<header>
<h3>Notebook</h3>
</header>
<section class="np-nonpublic">
<p>The following information is not intended for publication</p>
<h4>Editorial contacts</h4>
<ul>
<li>Reporting by: Ciarán Sunderland and Ansgar Haase in Brussels</li>
<li>Editing by: Bill Heaney, +49 30 2852 31472, &lt;[email protected]&gt;</li>
</ul>
</section>
<p class="closingline">dpa cis aha wjh</p>
</section>
</edNote>
<signal qcode="sig:update">
<name xml:lang="en">Update</name>
</signal>
<link contenttype="application/vnd.iptc.g2.newsitem+xml" rank="1" rel="irel:seeAlso" residref="urn:newsml:dpa.com:20090101:240703-99-619602">
<itemClass qcode="ninat:picture">
<name xml:lang="en">Picture Item(s)</name>
</itemClass>
<title>NATO allies nix multi-year Ukraine aid plan</title>
</link>
</itemMeta>
<contentMeta>
<urgency>3</urgency>
<located>
<name>Brussels</name>
</located>
<creator qcode="dpa-ad:sunderland.ciaran">
<name>sunderland.ciaran</name>
</creator>
<contributor qcode="dpa-ad:sunderland.ciaran">
<name>sunderland.ciaran</name>
</contributor>
<contributor qcode="dpa-ad:heaney.william">
<name>heaney.william</name>
</contributor>
<contributor qcode="dpa-ad:qassem.shorook">
<name>qassem.shorook</name>
</contributor>
<altId environment="dpasrv:eca" type="dpa7901rendition:iptc7901Id">eca:0048:3:i:147:dpa:0772::20240703131400MESZ</altId>
<genre qcode="dpatextgenre:21">
<name role="nrol:display" xml:lang="en">DEVELOPING</name>
</genre>
<subject qcode="dpacountry:181" rank="1" type="cpnat:geoArea">
<name role="nrol:display" xml:lang="en">Ukraine</name>
<sameAs qcode="iso3166-1a3:UKR"/>
<sameAs qcode="wikidata:Q212"/>
<sameAs qcode="iso3166-1a2:UA"/>
<broader qcode="wldreg:r150">
<name xml:lang="en">Europe</name>
</broader>
<broader qcode="dpageosbj:44">
<name role="nrol:display" xml:lang="en">Europe</name>
</broader>
</subject>
<subject qcode="dpacountry:139" rank="2" type="cpnat:geoArea">
<name role="nrol:display" xml:lang="en">Russia</name>
<sameAs qcode="iso3166-1a3:RUS"/>
<sameAs qcode="wikidata:Q159"/>
<sameAs qcode="iso3166-1a2:RU"/>
<broader qcode="wldreg:r150">
<name xml:lang="en">Europe</name>
</broader>
<broader qcode="dpageosbj:44">
<name role="nrol:display" xml:lang="en">Europe</name>
</broader>
</subject>
<subject qcode="dpasubject:114" rank="1" type="dpatype:dpasubject">
<name xml:lang="en">conflict</name>
<broader qcode="medtop:16000000">
<name xml:lang="en">conflict, war and peace</name>
</broader>
</subject>
<keyword rank="1">NATO</keyword>
<subject qcode="dpacat:pl" type="dpatype:category">
<name role="nrol:mnemonic" xml:lang="en">i</name>
</subject>

<headline rank="1">NATO allies nix multi-year Ukraine aid plan </headline>
<dateline>Brussels (dpa) - </dateline>
<creditline>dpa</creditline>
<language tag="en"/>
</contentMeta>
<assert qcode="dpapconstraint:nonpublic">
<name xml:lang="en">non public</name>
</assert>
<contentSet>




<inlineXML contenttype="application/xhtml+xml" wordcount="147">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta charset="utf-8"/>
<title>NATO allies nix multi-year Ukraine aid plan </title>
</head>
<body>
<header>
<time class="publicationDate" data-datetime="2024-07-03T13:13:54+02:00">03.07.2024 11:13 GMT</time>
<ul class="slugline">
<li class="subject" data-qcode="dpasubject:114">conflict</li>
<li class="geo" data-qcode="dpacountry:181">Ukraine</li>
<li class="geo" data-qcode="dpacountry:139">Russia</li>
<li class="keyword">NATO</li>
</ul>
<p class="genre dpatextgenre-21" data-qcode="dpatextgenre:21">DEVELOPING</p>
<h1>NATO allies nix multi-year Ukraine aid plan</h1>
</header>
<section class="main dpatextgenre-21">
<p>
<span class="dateline">Brussels <span class="credit">(dpa)</span> - </span>NATO Secretary General Jens Stoltenberg has failed to get allies to commit to a multi-year financial pledge to support Ukraine, according to information obtained by dpa on Wednesday. </p>
<p>Ahead of a NATO leaders' summit in Washington, allies would only commit to support for Ukraine worth €40 billion ($43 billion) within the next year, dpa learnt from delegations to the alliance. </p>
<p>Allies also did not reach an agreement on sharing the financial costs of supporting Ukraine, with NATO members vaguely stating that the gross domestic product (GDP) of a country's economy should play a role. </p>
<p>At a meeting of NATO defence ministers in June, Stoltenberg called on allies to agree a plan to maintain over the long term their current level of support to Ukraine, which he put at €40 billion per year. </p>
<p>The burden would be divided up according to NATO countries' GDP, with the United States contributing 50%. </p>
</section>
</body>
</html>
</inlineXML>
</contentSet>
</newsItem>
</itemSet>
</newsMessage>

0 comments on commit 7e0ad6b

Please sign in to comment.