From 8d7c6f2a26834f779f31a03ab8438dbb3530a5a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonah=20Br=C3=BCchert?= Date: Fri, 3 Jan 2025 20:55:50 +0100 Subject: [PATCH] generate-attribution: Try to extract contact information The idea of having an automated data issue reporting form came up during 38C3. The extracted list of contact addresses should be consumed as follows: 1. Use publisher address if it exists. The agency might be unaware of the feed's existence, and writing to them will not help. 2. Pick the agency with the matching agency_id if it exists. 3. Use a contact address with type "attribution". It should be manually checked whether this is applicable. --- src/generate-attribution.py | 46 ++++++++++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 6 deletions(-) diff --git a/src/generate-attribution.py b/src/generate-attribution.py index 2e2fb1f8..19af48c5 100755 --- a/src/generate-attribution.py +++ b/src/generate-attribution.py @@ -58,12 +58,9 @@ def http_source_attribution(source: HttpSource) -> Optional[dict]: print(f"Info: {feed_path} does not exist, skipping…") return None + contacts: list[dict] = [] + with ZipFile(feed_path) as z: - with z.open("agency.txt", "r") as a: - with io.TextIOWrapper(a) as at: - agencyreader = csv.DictReader(at, delimiter=",", quotechar='"') - for row in agencyreader: - attribution["operators"].append(row["agency_name"]) if "feed_info.txt" in z.namelist(): with z.open("feed_info.txt", "r") as i: with io.TextIOWrapper(i) as it: @@ -72,6 +69,31 @@ def http_source_attribution(source: HttpSource) -> Optional[dict]: attribution["publisher"] = {} attribution["publisher"]["name"] = publisher["feed_publisher_name"] attribution["publisher"]["url"] = publisher["feed_publisher_url"] + + contact = { + "type": "publisher", + "name": publisher["feed_publisher_name"], + "email": publisher.get("feed_contact_email"), + "url": publisher.get("feed_contact_url") + } + + contacts.append(contact) + + with z.open("agency.txt", "r") as a: 
+ with io.TextIOWrapper(a) as at: + agencyreader = list(csv.DictReader(at, delimiter=",", quotechar='"')) + + attribution["operators"] = \ + filter_duplicates(map(lambda agency: agency["agency_name"], + agencyreader)) + + contacts += map(lambda agency: { + "type": "agency", + "agency_id": agency.get("agency_id"), + "name": agency["agency_name"], + "email": agency.get("agency_email") + }, agencyreader) + if "attributions.txt" in z.namelist(): with z.open("attributions.txt", "r") as a: with io.TextIOWrapper(a) as at: @@ -80,12 +102,24 @@ def http_source_attribution(source: HttpSource) -> Optional[dict]: map( lambda contrib: { "name": contrib["organization_name"], - "url": contrib.get("attribution_url"), + "url": contrib.get("attribution_url") }, attributionstxt, ) ) + attribution_contacts = map(lambda operator: { + "type": "attribution", + "name": operator["organization_name"], + "email": operator.get("attribution_email") + }, attributionstxt) + + contacts += attribution_contacts + + attribution["contacts"] = \ + list(filter(lambda c: c.get("email") or c.get("url"), + contacts)) + if ( "operators" in attribution and len(attribution["operators"]) == 1