Skip woolies scrape if file already exists
* Add option to print the path a file would be saved to, so we know what to
  fetch from S3
* Add option to skip existing files to avoid a double scrape
* Update actions file to use the new option and download the file if it exists
Javex committed Oct 18, 2023
1 parent 92ab24d commit cae066b
Showing 5 changed files with 33 additions and 16 deletions.
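
Taken together, the updated workflow now runs roughly the following sequence (commands and bucket name are lifted from the workflow diff below; the example snapshot path is an assumption, the real value comes from output.get_save_path):

    # Ask the scraper where today's snapshot would be saved, e.g. something like woolies/2023-10-18.json.gz
    save_path="$(python3 main.py sync --print-save-path woolies)"
    # Try to pull an existing snapshot from S3; a failed copy is fine, it just means we scrape fresh
    mkdir -p ./output/
    aws s3 cp "s3://grocery-scrape-au/${save_path}" "./output/${save_path}" || true
    # Scrape only if the file is not already present locally
    python3 main.py sync woolies --skip-existing
    # Push the snapshot (downloaded or freshly scraped) back to S3
    aws s3 sync ./output/woolies/ s3://grocery-scrape-au/woolies/
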
15 changes: 10 additions & 5 deletions .github/workflows/scrape-groceries.yml
@@ -14,16 +14,21 @@ jobs:
     steps:
       - uses: actions/checkout@v4
       - run: pip3 install -r requirements.txt
-      - run: python3 main.py sync woolies
-      - uses: actions/upload-artifact@v3
-        with:
-          name: woolies_snapshot
-          path: ./output/woolies/
       - name: Configure AWS Credentials
         uses: aws-actions/configure-aws-credentials@v4
         with:
           role-to-assume: "${{ vars.TARGET_ROLE_ARN }}"
           aws-region: ap-southeast-2
+      # it's okay to ignore the exit code on the last line; if the file doesn't exist then we scrape it
+      - run: |
+          save_path="$(python3 main.py sync --print-save-path woolies)"
+          mkdir -p ./output/
+          aws s3 cp "s3://grocery-scrape-au/${save_path}" "./output/${save_path}" || true
+      - run: python3 main.py sync woolies --skip-existing
+      - uses: actions/upload-artifact@v3
+        with:
+          name: woolies_snapshot
+          path: ./output/woolies/
       - run: aws s3 sync ./output/woolies/ s3://grocery-scrape-au/woolies/
   scrape-coles:
     permissions:
9 changes: 4 additions & 5 deletions hotprices_au/output.py
@@ -4,7 +4,7 @@
 from datetime import datetime
 
 
-def get_save_path(store, output_dir, compression, day=None):
+def get_save_path(store, output_dir, compression='gzip', day=None):
     if day is None:
         day = datetime.now()
     date_str = day.strftime("%Y-%m-%d")
@@ -18,13 +18,12 @@ def get_save_path(store, output_dir, compression, day=None):
     return fpath
 
 
-def save_data(store, categories, output_dir, compression='gzip'):
-    fpath = get_save_path(store, output_dir, compression)
+def save_data(categories, save_path, compression='gzip'):
     if compression == 'gzip':
-        with gzip.open(fpath, 'wt') as fp:
+        with gzip.open(save_path, 'wt') as fp:
             fp.write(json.dumps(categories))
     elif compression is None:
-        fpath.write_text(json.dumps(categories))
+        save_path.write_text(json.dumps(categories))
     else:
         raise RuntimeError(f"Unsupported compression '{compression}'")
 
4 changes: 2 additions & 2 deletions hotprices_au/sites/coles.py
@@ -156,7 +156,7 @@ def parse_str_unit(size):
     return units.parse_str_unit(size)
 
 
-def main(quick, output_dir):
+def main(quick, save_path):
     coles = ColesScraper(store_id='0584', quick=quick)
     categories = coles.get_categories()
     #categories = load_cache()
@@ -171,7 +171,7 @@ def main(quick, output_dir):
         if quick:
             break
     #save_cache(categories)
-    output.save_data('coles', categories, output_dir)
+    output.save_data(categories, save_path)
     get_category_mapping(categories)
     #print(json.dumps(category, indent=4))
 
4 changes: 2 additions & 2 deletions hotprices_au/sites/woolies.py
@@ -168,7 +168,7 @@ def get_canonical(item, today):
     return result
 
 
-def main(quick, output_dir):
+def main(quick, save_path):
     woolies = WooliesAPI(quick=quick)
     categories = woolies.get_categories()
     #categories = load_cache()
@@ -182,7 +182,7 @@ def main(quick, output_dir):
 
         if quick:
             break
-    output.save_data('woolies', categories, output_dir)
+    output.save_data(categories, save_path)
     get_category_mapping(categories)
 
 
17 changes: 15 additions & 2 deletions main.py
@@ -2,11 +2,20 @@
 import logging
 from datetime import datetime
 import pathlib
-from hotprices_au import sites, analysis
+from hotprices_au import sites, analysis, output
 
 
 def main_sync(args):
-    sites.sites[args.store].main(args.quick, args.output_dir)
+    save_path = output.get_save_path(args.store, args.output_dir)
+    if args.print_save_path:
+        print(save_path.relative_to(args.output_dir), end='')
+    elif args.skip_existing and save_path.exists():
+        print(
+            f'Skipping because output file {save_path} already exists and '
+            f'requested to skip if output file exists.'
+        )
+    else:
+        sites.sites[args.store].main(args.quick, save_path)
 
 
 def main_analysis(args):
@@ -30,6 +39,10 @@ def main():
 
     sync_parser = subparsers.add_parser('sync')
     sync_parser.add_argument('--quick', action='store_true', default=False)
+    sync_parser.add_argument(
+        '--print-save-path', action='store_true', default=False,
+        help='Print relative path where file will be stored, then exit')
+    sync_parser.add_argument('--skip-existing', action='store_true', default=False)
     sync_parser.add_argument('store', choices=list(sites.sites))
     sync_parser.set_defaults(func=main_sync)

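A usage sketch for the two new sync flags (the printed path is illustrative; the real value is whatever output.get_save_path produces relative to the output directory):

    # Print the relative save path and exit without scraping
    $ python3 main.py sync --print-save-path woolies
    woolies/2023-10-18.json.gz
    # Scrape, but bail out early if the output file already exists
    $ python3 main.py sync woolies --skip-existing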