diff --git a/openprescribing/pipeline/management/commands/import_scmd.py b/openprescribing/pipeline/management/commands/import_scmd.py
index 180c753280..7aa0046fcd 100644
--- a/openprescribing/pipeline/management/commands/import_scmd.py
+++ b/openprescribing/pipeline/management/commands/import_scmd.py
@@ -2,7 +2,6 @@
 import tempfile
 
 import requests
-from bs4 import BeautifulSoup
 from django.core.management import BaseCommand
 from gcutils.bigquery import Client, build_schema
 from google.cloud.exceptions import Conflict
@@ -98,24 +97,37 @@ def handle(self, *args, **kwargs):
             print("{} | Ingested into BigQuery".format(month))
 
     def iter_dataset_urls(self, session):
-        """Extract CSV file URLs from the dataset page."""
-        datasets_url = "https://opendata.nhsbsa.net/dataset/secondary-care-medicines-data-indicative-price"
+        """Extract the SCMD CSV file URLs via the NHSBSA open data API.
+
+        Asks the `package_show` API action for the dataset's resources and
+        yields the URL of each CSV resource whose filename matches the
+        `scmd_<status>_<year><month>.csv` naming scheme.
+        """
+        dataset_name = "secondary-care-medicines-data-indicative-price"
+        dataset_url = (
+            "https://opendata.nhsbsa.net/api/3/action/package_show"
+            f"?id={dataset_name}"
+        )
 
-        # scrape available datasets
-        r = session.get(datasets_url)
+        r = session.get(dataset_url)
         r.raise_for_status()
 
-        doc = BeautifulSoup(r.text, "html.parser")
+        resources = r.json()["result"]["resources"]
+        pattern = re.compile(r"scmd_(final|provisional|wip)_[0-9]{6}\.csv")
+
+        for resource in resources:
+            # Tolerate resources with a missing/empty format or URL instead of
+            # letting a single odd entry abort the whole import.
+            file_format = resource.get("format") or ""
+            url = resource.get("url") or ""
+            if file_format.upper() == "CSV" and pattern.search(url.split("/")[-1]):
+                yield url
 
-        for a in doc.find_all("a", href=True):
-            if a["href"].endswith(".csv"):
-                yield a["href"]
-
     def iter_months(self, urls):
         """
         Extract a "month" from each URL given.
 
-        URLs are expected to end in the format `/SCMD_<status>_<year><month>.csv`, from
+        URLs are expected to end in the format `/scmd_<status>_<year><month>.csv`, from
         that we get the year and month, converting them to the format
         <year>-<month>.
         """