From 9420e460161da13d7e3cfd801495c6b009d02a8b Mon Sep 17 00:00:00 2001
From: Louis Fisher <louis.fisher1919@gmail.com>
Date: Fri, 24 May 2024 15:04:59 +0100
Subject: [PATCH] get dataset URLs from the NHSBSA open data API instead of
 scraping them

---
 .../management/commands/import_scmd.py        | 23 +++++++++++--------
 1 file changed, 13 insertions(+), 10 deletions(-)
diff --git a/openprescribing/pipeline/management/commands/import_scmd.py b/openprescribing/pipeline/management/commands/import_scmd.py
index 180c753280..383da750b4 100644
--- a/openprescribing/pipeline/management/commands/import_scmd.py
+++ b/openprescribing/pipeline/management/commands/import_scmd.py
@@ -1,8 +1,8 @@
 import csv
+import re
 import tempfile
 
 import requests
-from bs4 import BeautifulSoup
 from django.core.management import BaseCommand
 from gcutils.bigquery import Client, build_schema
 from google.cloud.exceptions import Conflict
@@ -98,24 +98,27 @@ def handle(self, *args, **kwargs):
                 print("{} | Ingested into BigQuery".format(month))
 
     def iter_dataset_urls(self, session):
-        """Extract CSV file URLs from the dataset page."""
-        datasets_url = "https://opendata.nhsbsa.net/dataset/secondary-care-medicines-data-indicative-price"
+        """Yield the URL of each SCMD CSV resource listed by the NHSBSA API."""
+        dataset_name = "secondary-care-medicines-data-indicative-price"
+        dataset_url = f"https://opendata.nhsbsa.net/api/3/action/package_show?id={dataset_name}"
 
-        # scrape available datasets
-        r = session.get(datasets_url)
+        r = session.get(dataset_url)
         r.raise_for_status()
 
-        doc = BeautifulSoup(r.text, "html.parser")
+        data = r.json()
+        resources = data['result']['resources']
 
-        for a in doc.find_all("a", href=True):
-            if a["href"].endswith(".csv"):
-                yield a["href"]
+        pattern = r"scmd_(final|provisional|wip)_[0-9]{6}\.csv"
+
+        for resource in resources:
+            if resource['format'].upper() == 'CSV' and re.search(pattern, resource['url'].split('/')[-1]):
+                yield resource['url']
 
     def iter_months(self, urls):
         """
         Extract a "month" from each URL given.
 
-        URLs are expected to end in the format `/SCMD_<something>_<year><month>.csv`, from
+        URLs are expected to end in the format `/scmd_<something>_<year><month>.csv`, from
         that we get the year and month, converting them to the format
         <year>-<month>.
         """