Skip to content

Commit

Permalink
Get dataset URLs from the NHSBSA open data API instead of scraping them
Browse files Browse the repository at this point in the history
  • Loading branch information
LFISHER7 committed May 24, 2024
1 parent b9f2afe commit 96d996b
Showing 1 changed file with 13 additions and 10 deletions.
23 changes: 13 additions & 10 deletions openprescribing/pipeline/management/commands/import_scmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import tempfile

import requests
from bs4 import BeautifulSoup
from django.core.management import BaseCommand
from gcutils.bigquery import Client, build_schema
from google.cloud.exceptions import Conflict
Expand Down Expand Up @@ -98,24 +97,28 @@ def handle(self, *args, **kwargs):
print("{} | Ingested into BigQuery".format(month))

def iter_dataset_urls(self, session):
    """Yield the URLs of the SCMD CSV files listed by the open data API.

    Fetches the metadata of the
    ``secondary-care-medicines-data-indicative-price`` dataset from the
    ``package_show`` API action on opendata.nhsbsa.net and yields the ``url``
    of every resource that is declared as CSV and whose file name looks like
    ``scmd_<final|provisional|wip>_<6 digits>.csv``.

    Args:
        session: object exposing ``get(url)`` that returns a response with
            ``raise_for_status()`` and ``json()`` (presumably a
            ``requests.Session`` -- confirm against the caller).

    Yields:
        str: the download URL of each matching resource, in the order the
        API lists them.

    Raises:
        Whatever ``raise_for_status()`` raises for a failed response, and
        ``KeyError`` if the payload has no ``result``/``resources`` entry.
    """
    dataset_name = "secondary-care-medicines-data-indicative-price"
    dataset_url = (
        "https://opendata.nhsbsa.net/api/3/action/package_show"
        f"?id={dataset_name}"
    )

    r = session.get(dataset_url)
    r.raise_for_status()

    resources = r.json()["result"]["resources"]

    # Compiled once, outside the loop; applied to the file name only (the
    # last path segment), not to the whole URL.
    filename_pattern = re.compile(
        r"scmd_(final|provisional|wip)_[0-9]{6}\.csv"
    )

    for resource in resources:
        # A resource with a missing or null "format"/"url" cannot be a file
        # we want, so skip it rather than fail the whole import.
        file_format = resource.get("format") or ""
        url = resource.get("url") or ""

        if file_format.upper() != "CSV":
            continue

        if filename_pattern.search(url.rsplit("/", 1)[-1]):
            yield url

def iter_months(self, urls):
"""
Extract a "month" from each URL given.
URLs are expected to end in the format `/SCMD_<something>_<year><month>.csv`, from
URLs are expected to end in the format `/scmd_<something>_<year><month>.csv`, from
that we get the year and month, converting them to the format
<year>-<month>.
"""
Expand Down

0 comments on commit 96d996b

Please sign in to comment.