generate-attribution: Try to extract contact information

The idea of having an automated data issue reporting form came up during 38C3. The extracted list of contact addresses should be consumed as follows: 1. Use the publisher address if it exists. The agency might be unaware of the feed's existence, and writing to them will not help. 2. Pick the agency with the matching agency_id if it exists. 3. Use a contact address with type "attribution". It should be manually checked whether this is applicable.
public-transport · Jan 3, 2025 · 8d7c6f2 · 8d7c6f2
1 parent 47dcb7a
commit 8d7c6f2
Showing 1 changed file with 40 additions and 6 deletions.
diff --git a/src/generate-attribution.py b/src/generate-attribution.py
@@ -58,12 +58,9 @@ def http_source_attribution(source: HttpSource) -> Optional[dict]:
         print(f"Info: {feed_path} does not exist, skipping…")
         return None
 
+    contacts: list[dict] = []
+
     with ZipFile(feed_path) as z:
-        with z.open("agency.txt", "r") as a:
-            with io.TextIOWrapper(a) as at:
-                agencyreader = csv.DictReader(at, delimiter=",", quotechar='"')
-                for row in agencyreader:
-                    attribution["operators"].append(row["agency_name"])
         if "feed_info.txt" in z.namelist():
             with z.open("feed_info.txt", "r") as i:
                 with io.TextIOWrapper(i) as it:
@@ -72,6 +69,31 @@ def http_source_attribution(source: HttpSource) -> Optional[dict]:
                     attribution["publisher"] = {}
                     attribution["publisher"]["name"] = publisher["feed_publisher_name"]
                     attribution["publisher"]["url"] = publisher["feed_publisher_url"]
+
+                    contact = {
+                            "type": "publisher",
+                            "name": publisher["feed_publisher_name"],
+                            "email": publisher.get("feed_contact_email"),
+                            "url": publisher.get("feed_contact_url")
+                    }
+
+                    contacts.append(contact)
+
+        with z.open("agency.txt", "r") as a:
+            with io.TextIOWrapper(a) as at:
+                agencyreader = list(csv.DictReader(at, delimiter=",", quotechar='"'))
+
+                attribution["operators"] = \
+                    filter_duplicates(map(lambda agency: agency["agency_name"],
+                                          agencyreader))
+
+                contacts += map(lambda agency: {
+                        "type": "agency",
+                        "agency_id": agency.get("agency_id"),
+                        "name": agency["agency_name"],
+                        "email": agency.get("agency_email")
+                    }, agencyreader)
+
         if "attributions.txt" in z.namelist():
             with z.open("attributions.txt", "r") as a:
                 with io.TextIOWrapper(a) as at:
@@ -80,12 +102,24 @@ def http_source_attribution(source: HttpSource) -> Optional[dict]:
                         map(
                             lambda contrib: {
                                 "name": contrib["organization_name"],
-                                "url": contrib.get("attribution_url"),
+                                "url": contrib.get("attribution_url")
                             },
                             attributionstxt,
                         )
                     )
 
+                    attribution_contacts = map(lambda operator: {
+                            "type": "attribution",
+                            "name": operator["organization_name"],
+                            "email": operator.get("attribution_email")
+                        }, attributionstxt)
+
+                    contacts += attribution_contacts
+
+    attribution["contacts"] = \
+        list(filter(lambda c: c.get("email") or c.get("url"),
+                    contacts))
+
     if (
         "operators" in attribution
         and len(attribution["operators"]) == 1