diff --git a/.github/workflows/scrape-groceries.yml b/.github/workflows/scrape-groceries.yml index 790e2f2..b4800ee 100644 --- a/.github/workflows/scrape-groceries.yml +++ b/.github/workflows/scrape-groceries.yml @@ -14,16 +14,21 @@ jobs: steps: - uses: actions/checkout@v4 - run: pip3 install -r requirements.txt - - run: python3 main.py sync woolies - - uses: actions/upload-artifact@v3 - with: - name: woolies_snapshot - path: ./output/woolies/ - name: Configure AWS Credentials uses: aws-actions/configure-aws-credentials@v4 with: role-to-assume: "${{ vars.TARGET_ROLE_ARN }}" aws-region: ap-southeast-2 + # it's okay to ignore the exit code on the last line, if the file doesn't exist then we scrape it + - run: | + save_path="$(python3 main.py sync --print-save-path woolies)" + mkdir -p ./output/ + aws s3 cp "s3://grocery-scrape-au/${save_path}" "./output/${save_path}" || true + - run: python3 main.py sync woolies --skip-existing + - uses: actions/upload-artifact@v3 + with: + name: woolies_snapshot + path: ./output/woolies/ - run: aws s3 sync ./output/woolies/ s3://grocery-scrape-au/woolies/ scrape-coles: permissions: diff --git a/hotprices_au/output.py b/hotprices_au/output.py index 6fafb65..351f702 100644 --- a/hotprices_au/output.py +++ b/hotprices_au/output.py @@ -4,7 +4,7 @@ from datetime import datetime -def get_save_path(store, output_dir, compression, day=None): +def get_save_path(store, output_dir, compression='gzip', day=None): if day is None: day = datetime.now() date_str = day.strftime("%Y-%m-%d") @@ -18,13 +18,12 @@ def get_save_path(store, output_dir, compression, day=None): return fpath -def save_data(store, categories, output_dir, compression='gzip'): - fpath = get_save_path(store, output_dir, compression) +def save_data(categories, save_path, compression='gzip'): if compression == 'gzip': - with gzip.open(fpath, 'wt') as fp: + with gzip.open(save_path, 'wt') as fp: fp.write(json.dumps(categories)) elif compression is None: - 
fpath.write_text(json.dumps(categories)) + save_path.write_text(json.dumps(categories)) else: raise RuntimeError(f"Unsupported compression '{compression}'") diff --git a/hotprices_au/sites/coles.py b/hotprices_au/sites/coles.py index 1b38296..1f91cc0 100644 --- a/hotprices_au/sites/coles.py +++ b/hotprices_au/sites/coles.py @@ -156,7 +156,7 @@ def parse_str_unit(size): return units.parse_str_unit(size) -def main(quick, output_dir): +def main(quick, save_path): coles = ColesScraper(store_id='0584', quick=quick) categories = coles.get_categories() #categories = load_cache() @@ -171,7 +171,7 @@ def main(quick, output_dir): if quick: break #save_cache(categories) - output.save_data('coles', categories, output_dir) + output.save_data(categories, save_path) get_category_mapping(categories) #print(json.dumps(category, indent=4)) diff --git a/hotprices_au/sites/woolies.py b/hotprices_au/sites/woolies.py index 0311815..f8edbb8 100644 --- a/hotprices_au/sites/woolies.py +++ b/hotprices_au/sites/woolies.py @@ -168,7 +168,7 @@ def get_canonical(item, today): return result -def main(quick, output_dir): +def main(quick, save_path): woolies = WooliesAPI(quick=quick) categories = woolies.get_categories() #categories = load_cache() @@ -182,7 +182,7 @@ def main(quick, output_dir): if quick: break - output.save_data('woolies', categories, output_dir) + output.save_data(categories, save_path) get_category_mapping(categories) diff --git a/main.py b/main.py index c428d23..41374a4 100644 --- a/main.py +++ b/main.py @@ -2,11 +2,20 @@ import logging from datetime import datetime import pathlib -from hotprices_au import sites, analysis +from hotprices_au import sites, analysis, output def main_sync(args): - sites.sites[args.store].main(args.quick, args.output_dir) + save_path = output.get_save_path(args.store, args.output_dir) + if args.print_save_path: + print(save_path.relative_to(args.output_dir), end='') + elif args.skip_existing and save_path.exists(): + print( + f'Skipping because 
output file {save_path} already exists and ' + f'requested to skip if output file exists.' + ) + else: + sites.sites[args.store].main(args.quick, save_path) def main_analysis(args): @@ -30,6 +39,10 @@ def main(): sync_parser = subparsers.add_parser('sync') sync_parser.add_argument('--quick', action='store_true', default=False) + sync_parser.add_argument( + '--print-save-path', action='store_true', default=False, + help='Print relative path where file will be stored, then exit') + sync_parser.add_argument('--skip-existing', action='store_true', default=False) sync_parser.add_argument('store', choices=list(sites.sites)) sync_parser.set_defaults(func=main_sync)