diff --git a/.github/workflows/scrape-woolies.yml b/.github/workflows/scrape-woolies.yml
index 0dcf0ff..74da27d 100644
--- a/.github/workflows/scrape-woolies.yml
+++ b/.github/workflows/scrape-woolies.yml
@@ -1,4 +1,4 @@
-name: Scrape Woolies
+name: Scrape Groceries
 on:
   push:
   schedule:
@@ -24,4 +24,23 @@ jobs:
         with:
           role-to-assume: "${{ vars.TARGET_ROLE_ARN }}"
           aws-region: ap-southeast-2
-      - run: aws s3 sync ./woolies/ s3://grocery-scrape-au/woolies/
\ No newline at end of file
+      - run: aws s3 sync ./woolies/ s3://grocery-scrape-au/woolies/
+  scrape-coles:
+    permissions:
+      contents: read # Required for checkout action
+      id-token: write # This is required for requesting the JWT
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - run: pip3 install -r requirements.txt
+      - run: python3 coles.py
+      - uses: actions/upload-artifact@v4
+        with:
+          name: coles_snapshot
+          path: ./coles/*.json
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: "${{ vars.TARGET_ROLE_ARN }}"
+          aws-region: ap-southeast-2
+      - run: aws s3 sync ./coles/ s3://grocery-scrape-au/coles/
diff --git a/coles.py b/coles.py
new file mode 100644
index 0000000..8779c9d
--- /dev/null
+++ b/coles.py
@@ -0,0 +1,118 @@
+"""Scrape Coles product data per category into ./coles/<date>.json."""
+import requests
+import json
+import sys
+import pathlib
+from datetime import datetime
+from bs4 import BeautifulSoup
+
+
+class ColesScraper:
+
+    def __init__(self, store_id, quick=False):
+        # quick=True stops after the first page / first category (for testing)
+        self.quick = quick
+        self.store_id = store_id
+
+        self.session = requests.Session()
+        self.session.headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
+            'Origin': 'https://www.coles.com.au',
+            'Referer': 'https://www.coles.com.au',
+        }
+        self.start()
+
+    def start(self):
+        # Need to get the subscription key and the current Next.js build id
+        response = self.session.get('https://www.coles.com.au', timeout=30)
+        response.raise_for_status()
+        html = BeautifulSoup(response.text, features="html.parser")
+        next_data_script = html.find("script", id="__NEXT_DATA__")
+        next_data_json = json.loads(next_data_script.string)
+        self.api_key = next_data_json['runtimeConfig']['BFF_API_SUBSCRIPTION_KEY']
+        # buildId changes on every site deploy; a hard-coded one would 404 later
+        self.build_id = next_data_json['buildId']
+        self.session.headers['ocp-apim-subscription-key'] = self.api_key
+
+    def get_category(self, cat_slug):
+        # Yields every product result in the category, paging until exhausted
+        params = {
+            'slug': cat_slug,
+            'page': 1,
+        }
+        product_count = 0
+        while True:
+            print(f'Page {params["page"]}')
+            response = self.session.get(f'https://www.coles.com.au/_next/data/{self.build_id}/en/browse/{cat_slug}.json', params=params, timeout=30)
+            response.raise_for_status()
+            response_data = response.json()
+            search_results = response_data['pageProps']['searchResults']
+            for result in search_results['results']:
+                yield result
+
+            # Next page calculation
+            total_products = search_results['noOfResults']
+            product_count += len(search_results['results'])
+            if product_count >= total_products:
+                # We're done
+                break
+
+            # Temporary speedup
+            if self.quick:
+                break
+
+            # Not done, go to next page
+            params['page'] += 1
+
+    def get_categories(self):
+        response = self.session.get(f'https://www.coles.com.au/api/bff/products/categories?storeId={self.store_id}', timeout=30)
+        response.raise_for_status()
+        category_data = response.json()
+        categories = category_data['catalogGroupView']
+        return categories
+
+
+def save_data(categories):
+    # Write the scraped categories to coles/YYYY-MM-DD.json
+    now = datetime.now()
+    date_str = now.strftime("%Y-%m-%d")
+    fname = f"{date_str}.json"
+    save_dir = pathlib.Path("coles")
+    save_dir.mkdir(exist_ok=True)
+    fpath = save_dir / fname
+    fpath.write_text(json.dumps(categories))
+
+
+def main():
+    quick = False
+    if len(sys.argv) > 1 and sys.argv[1] == "--quick":
+        quick = True
+    coles = ColesScraper(store_id='0584', quick=quick)
+    categories = coles.get_categories()
+    #categories = load_cache()
+    for category_obj in categories:
+        cat_slug = category_obj['seoToken']
+
+        if cat_slug in ['down-down', 'back-to-school']:
+            # Skip for now, expect duplicate products
+            continue
+
+        if 'Products' in category_obj:
+            # Already cached
+            continue
+
+        cat_desc = category_obj['name']
+        print(f'Fetching category {cat_slug} ({cat_desc})')
+        category = coles.get_category(cat_slug)
+        all_category_bundles = list(category)
+        category_obj['Products'] = all_category_bundles
+
+        if quick:
+            break
+    #save_cache(categories)
+    save_data(categories)
+    #print(json.dumps(category, indent=4))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/requirements.txt b/requirements.txt
index 663bd1f..a98ae43 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,2 @@
-requests
\ No newline at end of file
+requests
+beautifulsoup4