From 8d7c6f2a26834f779f31a03ab8438dbb3530a5a0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jonah=20Br=C3=BCchert?= <jbb@kaidan.im>
Date: Fri, 3 Jan 2025 20:55:50 +0100
Subject: [PATCH] generate-attribution: Try to extract contact information

The idea of having an automated data issue reporting form came up during
38C3.

The extracted list of contact addresses should be consumed as follows:
1. Use the publisher address if it exists. The agency might be unaware of
   the feed's existence, and writing to them will not help.
2. Pick the agency with the matching agency_id if it exists.
3. Use a contact address with type "attribution". It should be manually
   checked whether this is applicable.
---
 src/generate-attribution.py | 46 ++++++++++++++++++++++++++++++++-----
 1 file changed, 40 insertions(+), 6 deletions(-)

diff --git a/src/generate-attribution.py b/src/generate-attribution.py
index 2e2fb1f8..19af48c5 100755
--- a/src/generate-attribution.py
+++ b/src/generate-attribution.py
@@ -58,12 +58,9 @@ def http_source_attribution(source: HttpSource) -> Optional[dict]:
         print(f"Info: {feed_path} does not exist, skipping…")
         return None
 
+    contacts: list[dict] = []
+
     with ZipFile(feed_path) as z:
-        with z.open("agency.txt", "r") as a:
-            with io.TextIOWrapper(a) as at:
-                agencyreader = csv.DictReader(at, delimiter=",", quotechar='"')
-                for row in agencyreader:
-                    attribution["operators"].append(row["agency_name"])
         if "feed_info.txt" in z.namelist():
             with z.open("feed_info.txt", "r") as i:
                 with io.TextIOWrapper(i) as it:
@@ -72,6 +69,31 @@ def http_source_attribution(source: HttpSource) -> Optional[dict]:
                     attribution["publisher"] = {}
                     attribution["publisher"]["name"] = publisher["feed_publisher_name"]
                     attribution["publisher"]["url"] = publisher["feed_publisher_url"]
+
+                    contact = {
+                            "type": "publisher",
+                            "name": publisher["feed_publisher_name"],
+                            "email": publisher.get("feed_contact_email"),
+                            "url": publisher.get("feed_contact_url")
+                    }
+
+                    contacts.append(contact)
+
+        with z.open("agency.txt", "r") as a:
+            with io.TextIOWrapper(a) as at:
+                agencyreader = list(csv.DictReader(at, delimiter=",", quotechar='"'))
+
+                attribution["operators"] = \
+                    filter_duplicates(map(lambda agency: agency["agency_name"],
+                                          agencyreader))
+
+                contacts += map(lambda agency: {
+                        "type": "agency",
+                        "agency_id": agency.get("agency_id"),
+                        "name": agency["agency_name"],
+                        "email": agency.get("agency_email")
+                    }, agencyreader)
+
         if "attributions.txt" in z.namelist():
             with z.open("attributions.txt", "r") as a:
                 with io.TextIOWrapper(a) as at:
@@ -80,12 +102,24 @@ def http_source_attribution(source: HttpSource) -> Optional[dict]:
                         map(
                             lambda contrib: {
                                 "name": contrib["organization_name"],
-                                "url": contrib.get("attribution_url"),
+                                "url": contrib.get("attribution_url")
                             },
                             attributionstxt,
                         )
                     )
 
+                    attribution_contacts = map(lambda operator: {
+                            "type": "attribution",
+                            "name": operator["organization_name"],
+                            "email": operator.get("attribution_email")
+                        }, attributionstxt)
+
+                    contacts += attribution_contacts
+
+    attribution["contacts"] = \
+        list(filter(lambda c: c.get("email") or c.get("url"),
+                    contacts))
+
     if (
         "operators" in attribution
         and len(attribution["operators"]) == 1