Add coles scraping

Javex · Sep 26, 2023 · 590d1d6 · 590d1d6
1 parent f58a414
commit 590d1d6
Show file tree

Hide file tree

Showing 3 changed files with 135 additions and 3 deletions.
diff --git a/.github/workflows/scrape-woolies.yml b/.github/workflows/scrape-woolies.yml
@@ -1,4 +1,4 @@
-name: Scrape Woolies
+name: Scrape Groceries
 on:
   push:
   schedule:
@@ -24,4 +24,23 @@ jobs:
         with:
           role-to-assume: "${{ vars.TARGET_ROLE_ARN }}"
           aws-region: ap-southeast-2
-      - run: aws s3 sync ./woolies/ s3://grocery-scrape-au/woolies/
+      - run: aws s3 sync ./woolies/ s3://grocery-scrape-au/woolies/
+  # Scrape Coles, keep the resulting JSON as a build artifact, then sync the
+  # ./coles/ directory to the S3 bucket.
+  scrape-coles:
+    permissions:
+      contents: read  # Required for checkout action
+      id-token: write # This is required for requesting the JWT
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - run: pip3 install -r requirements.txt
+      - run: python3 coles.py
+      # upload-artifact v3 has been retired by GitHub; v4 is the supported
+      # release. The artifact name must be unique within a workflow run.
+      - uses: actions/upload-artifact@v4
+        with:
+          name: coles_snapshot
+          path: ./coles/*.json
+      # Assume the role named by the TARGET_ROLE_ARN configuration variable
+      # before touching S3.
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: "${{ vars.TARGET_ROLE_ARN }}"
+          aws-region: ap-southeast-2
+      - run: aws s3 sync ./coles/ s3://grocery-scrape-au/coles/
diff --git a/coles.py b/coles.py
@@ -0,0 +1,112 @@
+import requests
+import json
+import sys
+import pathlib
+from datetime import datetime
+from bs4 import BeautifulSoup
+
+
+class ColesScraper:
+
+    def __init__(self, store_id, quick=False):
+        self.quick = quick
+        self.store_id = store_id
+
+        self.session = requests.Session()
+        self.session.headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
+            'Origin': 'https://www.coles.com.au',
+            'Referer': 'https://www.coles.com.au',
+        }
+        self.start()
+
+    def start(self):
+        # Need to get the subscription key
+        response = self.session.get('https://www.coles.com.au')
+        response.raise_for_status()
+        html = BeautifulSoup(response.text, features="html.parser")
+        next_data_script = html.find("script", id="__NEXT_DATA__")
+        next_data_json = json.loads(next_data_script.string)
+        self.api_key = next_data_json['runtimeConfig']['BFF_API_SUBSCRIPTION_KEY']
+        self.session.headers['ocp-apim-subscription-key'] = self.api_key
+
+    def get_category(self, cat_slug):
+        params = {
+            'slug': cat_slug,
+            'page': 1,
+        }
+        product_count = 0
+        while True:
+            print(f'Page {params["page"]}')
+            response = self.session.get(f'https://www.coles.com.au/_next/data/20230922.01_v3.52.0/en/browse/{cat_slug}.json', params=params)
+            response.raise_for_status()
+            response_data = response.json()
+            search_results = response_data['pageProps']['searchResults']
+            for result in search_results['results']:
+                yield result
+
+            # Next page calculation
+            total_products = search_results['noOfResults']
+            product_count += len(search_results['results'])
+            if product_count >= total_products:
+                # We're done
+                break
+
+            # Temporary speedup
+            if self.quick:
+                break
+
+            # Not done, go to next page
+            params['page'] += 1
+
+    def get_categories(self):
+        response = self.session.get(f'https://www.coles.com.au/api/bff/products/categories?storeId={self.store_id}')
+        response.raise_for_status()
+        category_data = response.json()
+        categories = category_data['catalogGroupView']
+        return categories
+
+
+def save_data(categories):
+    now = datetime.now()
+    date_str = now.strftime("%Y-%m-%d")
+    fname = f"{date_str}.json"
+    save_dir = pathlib.Path(f"coles")
+    save_dir.mkdir(exist_ok=True)
+    fpath = save_dir / fname
+    fpath.write_text(json.dumps(categories))
+
+
+def main():
+    quick = False
+    if len(sys.argv) > 1 and sys.argv[1] == "--quick":
+        quick = True
+    coles = ColesScraper(store_id='0584', quick=quick)
+    categories = coles.get_categories()
+    #categories = load_cache()
+    for category_obj in categories:
+        cat_slug = category_obj['seoToken']
+
+        if cat_slug in ['down-down', 'back-to-school']:
+            # Skip for now, expect duplicate products
+            continue
+
+        if 'Products' in category_obj:
+            # Already cached
+            continue
+
+        cat_desc = category_obj['name']
+        print(f'Fetching category {cat_slug} ({cat_desc})')
+        category = coles.get_category(cat_slug)
+        all_category_bundles = list(category)
+        category_obj['Products'] = all_category_bundles
+
+        if quick:
+            break
+        #save_cache(categories)
+    save_data(categories)
+    #print(json.dumps(category, indent=4))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/requirements.txt b/requirements.txt
@@ -1 +1,2 @@
-requests
+requests
+beautifulsoup4