From 9420e460161da13d7e3cfd801495c6b009d02a8b Mon Sep 17 00:00:00 2001
From: Louis Fisher
Date: Fri, 24 May 2024 15:04:59 +0100
Subject: [PATCH] get dataset urls from the bsa open data API instead of scraping them

---
 .../management/commands/import_scmd.py | 23 +++++++++++--------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/openprescribing/pipeline/management/commands/import_scmd.py b/openprescribing/pipeline/management/commands/import_scmd.py
index 180c753280..383da750b4 100644
--- a/openprescribing/pipeline/management/commands/import_scmd.py
+++ b/openprescribing/pipeline/management/commands/import_scmd.py
@@ -1,8 +1,8 @@
 import csv
 import tempfile
+import re
 
 import requests
-from bs4 import BeautifulSoup
 from django.core.management import BaseCommand
 from gcutils.bigquery import Client, build_schema
 from google.cloud.exceptions import Conflict
@@ -98,24 +98,27 @@ def handle(self, *args, **kwargs):
                 print("{} | Ingested into BigQuery".format(month))
 
     def iter_dataset_urls(self, session):
-        """Extract CSV file URLs from the dataset page."""
-        datasets_url = "https://opendata.nhsbsa.net/dataset/secondary-care-medicines-data-indicative-price"
+        """Extract CSV file URLs via the API"""
+        dataset_name = "secondary-care-medicines-data-indicative-price"
+        dataset_url = f"https://opendata.nhsbsa.net/api/3/action/package_show?id={dataset_name}"
 
-        # scrape available datasets
-        r = session.get(datasets_url)
+        r = session.get(dataset_url)
         r.raise_for_status()
 
-        doc = BeautifulSoup(r.text, "html.parser")
+        data = r.json()
+        resources = data['result']['resources']
 
-        for a in doc.find_all("a", href=True):
-            if a["href"].endswith(".csv"):
-                yield a["href"]
+        pattern = r"scmd_(final|provisional|wip)_[0-9]{6}\.csv"
+
+        for resource in resources:
+            if resource['format'].upper() == 'CSV' and re.search(pattern, resource['url'].split('/')[-1]):
+                yield resource['url']
 
     def iter_months(self, urls):
         """
         Extract a "month" from each URL given.
 
-        URLs are expected to end in the format `/SCMD__.csv`, from
+        URLs are expected to end in the format `/scmd__.csv`, from
         that we get the year and month, converting them to the format
         -.
         """