Skip to content

Commit

Permalink
Add coles scraping
Browse files Browse the repository at this point in the history
  • Loading branch information
Javex committed Sep 26, 2023
1 parent f58a414 commit 590d1d6
Show file tree
Hide file tree
Showing 3 changed files with 135 additions and 3 deletions.
23 changes: 21 additions & 2 deletions .github/workflows/scrape-woolies.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: Scrape Woolies
name: Scrape Groceries
on:
push:
schedule:
Expand All @@ -24,4 +24,23 @@ jobs:
with:
role-to-assume: "${{ vars.TARGET_ROLE_ARN }}"
aws-region: ap-southeast-2
- run: aws s3 sync ./woolies/ s3://grocery-scrape-au/woolies/
- run: aws s3 sync ./woolies/ s3://grocery-scrape-au/woolies/
scrape-coles:
permissions:
contents: read # Required for checkout action
id-token: write # This is required for requesting the JWT
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- run: pip3 install -r requirements.txt
- run: python3 coles.py
- uses: actions/upload-artifact@v3
with:
name: coles_snapshot
path: ./coles/*.json
- name: Configure AWS Credentials
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: "${{ vars.TARGET_ROLE_ARN }}"
aws-region: ap-southeast-2
- run: aws s3 sync ./coles/ s3://grocery-scrape-au/coles/
112 changes: 112 additions & 0 deletions coles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
import requests
import json
import sys
import pathlib
from datetime import datetime
from bs4 import BeautifulSoup


class ColesScraper:
    """Scrapes product listings from the Coles online store.

    Bootstraps itself by loading the Coles homepage to extract the BFF API
    subscription key and the current Next.js build ID from the embedded
    ``__NEXT_DATA__`` payload, then pages through category search results.
    """

    def __init__(self, store_id, quick=False):
        # quick: stop after the first page per category (fast smoke-run mode)
        self.quick = quick
        self.store_id = store_id

        self.session = requests.Session()
        # A browser-like User-Agent plus Origin/Referer headers are needed
        # for the Coles endpoints to accept the requests.
        self.session.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
            'Origin': 'https://www.coles.com.au',
            'Referer': 'https://www.coles.com.au',
        }
        self.start()

    def start(self):
        """Fetch the homepage and extract the API key and Next.js build ID."""
        response = self.session.get('https://www.coles.com.au')
        response.raise_for_status()
        html = BeautifulSoup(response.text, features="html.parser")
        next_data_script = html.find("script", id="__NEXT_DATA__")
        next_data_json = json.loads(next_data_script.string)
        self.api_key = next_data_json['runtimeConfig']['BFF_API_SUBSCRIPTION_KEY']
        # Use the build ID advertised by the page itself instead of a
        # hard-coded one, so category URLs keep working after Coles deploys
        # a new frontend build. Fall back to the last known-good build ID
        # in case the key is ever absent.
        self.build_id = next_data_json.get('buildId', '20230922.01_v3.52.0')
        self.session.headers['ocp-apim-subscription-key'] = self.api_key

    def get_category(self, cat_slug):
        """Yield every product result in the category *cat_slug*, paging as needed."""
        params = {
            'slug': cat_slug,
            'page': 1,
        }
        product_count = 0
        while True:
            print(f'Page {params["page"]}')
            response = self.session.get(
                f'https://www.coles.com.au/_next/data/{self.build_id}/en/browse/{cat_slug}.json',
                params=params,
            )
            response.raise_for_status()
            response_data = response.json()
            search_results = response_data['pageProps']['searchResults']
            for result in search_results['results']:
                yield result

            # Stop once we have seen as many products as the API reports.
            total_products = search_results['noOfResults']
            product_count += len(search_results['results'])
            if product_count >= total_products:
                # We're done
                break

            # Temporary speedup
            if self.quick:
                break

            # Not done, go to next page
            params['page'] += 1

    def get_categories(self):
        """Return the store's top-level category list (``catalogGroupView``)."""
        response = self.session.get(f'https://www.coles.com.au/api/bff/products/categories?storeId={self.store_id}')
        response.raise_for_status()
        category_data = response.json()
        categories = category_data['catalogGroupView']
        return categories


def save_data(categories):
    """Serialize *categories* to ``coles/<YYYY-MM-DD>.json``.

    Creates the ``coles`` output directory if it does not already exist;
    a same-day rerun overwrites that day's snapshot.
    """
    date_str = datetime.now().strftime("%Y-%m-%d")
    save_dir = pathlib.Path("coles")  # plain string: the f-string had no placeholders
    save_dir.mkdir(exist_ok=True)
    fpath = save_dir / f"{date_str}.json"
    fpath.write_text(json.dumps(categories))


def main():
    """Scrape every Coles category for store 0584 and save a JSON snapshot.

    Pass ``--quick`` as the first CLI argument to fetch only the first page
    of the first category (smoke-test mode).
    """
    quick = len(sys.argv) > 1 and sys.argv[1] == "--quick"
    coles = ColesScraper(store_id='0584', quick=quick)
    categories = coles.get_categories()
    for category_obj in categories:
        cat_slug = category_obj['seoToken']

        if cat_slug in ['down-down', 'back-to-school']:
            # Skip for now, expect duplicate products
            continue

        if 'Products' in category_obj:
            # Already cached
            continue

        cat_desc = category_obj['name']
        print(f'Fetching category {cat_slug} ({cat_desc})')
        # Materialize the generator so the products can be JSON-serialized.
        category_obj['Products'] = list(coles.get_category(cat_slug))

        if quick:
            break
    save_data(categories)


if __name__ == '__main__':
    main()
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
requests
requests
beautifulsoup4

0 comments on commit 590d1d6

Please sign in to comment.