scrape-groceries.yml
name: Scrape Groceries

on:
  workflow_dispatch:
  schedule:
    - cron: "0 1 * * *" # Daily at 1am UTC (11am AEST / 12pm AEDT)

env:
  AWS_REGION: ap-southeast-2

jobs:
  scrape-woolies:
    permissions:
      contents: read # Required for the checkout action
      id-token: write # Required for requesting the OIDC JWT
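    # The id-token permission lets the job request a GitHub OIDC token, which
    # configure-aws-credentials exchanges for temporary AWS credentials via
    # sts:AssumeRoleWithWebIdentity. A sketch of the trust policy the target
    # role would need (account ID and repo path are placeholder assumptions):
    #   {
    #     "Effect": "Allow",
    #     "Principal": { "Federated": "arn:aws:iam::<account-id>:oidc-provider/token.actions.githubusercontent.com" },
    #     "Action": "sts:AssumeRoleWithWebIdentity",
    #     "Condition": {
    #       "StringEquals": { "token.actions.githubusercontent.com:aud": "sts.amazonaws.com" },
    #       "StringLike": { "token.actions.githubusercontent.com:sub": "repo:<owner>/<repo>:*" }
    #     }
    #   }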
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - run: pip3 install -r requirements.txt
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: "${{ vars.TARGET_ROLE_ARN }}"
          aws-region: ap-southeast-2
      # It's okay to ignore the exit code on the last line: if the file
      # doesn't exist in S3 yet, we scrape it from scratch.
      - run: |
          save_path="$(python3 main.py sync --print-save-path woolies)"
          echo "Save path is ${save_path}"
          mkdir -p ./output/
          aws s3 cp "s3://grocery-scrape-au/${save_path}" "./output/${save_path}" || true
      - run: python3 main.py sync woolies --skip-existing
      - uses: actions/upload-artifact@v4
        with:
          name: woolies_snapshot
          path: ./output/woolies/
      # Need to refresh credentials: the job can run for a while and the
      # assumed-role session expires.
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: "${{ vars.TARGET_ROLE_ARN }}"
          aws-region: ap-southeast-2
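      # Alternatively, requesting a longer session up front could avoid the
      # mid-job refresh (a sketch; 2 hours is an assumption and must not
      # exceed the role's configured maximum session duration):
      #   with:
      #     role-to-assume: "${{ vars.TARGET_ROLE_ARN }}"
      #     aws-region: ap-southeast-2
      #     role-duration-seconds: 7200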
      - run: aws s3 sync ./output/woolies/ s3://grocery-scrape-au/woolies/
  scrape-coles:
    permissions:
      contents: read # Required for the checkout action
      id-token: write # Required for requesting the OIDC JWT
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - run: pip3 install -r requirements.txt
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: "${{ vars.TARGET_ROLE_ARN }}"
          aws-region: ap-southeast-2
      # It's okay to ignore the exit code on the last line: if the file
      # doesn't exist in S3 yet, we scrape it from scratch.
      - run: |
          save_path="$(python3 main.py sync --print-save-path coles)"
          echo "Save path is ${save_path}"
          mkdir -p ./output/
          aws s3 cp "s3://grocery-scrape-au/${save_path}" "./output/${save_path}" || true
      - run: python3 main.py sync coles --skip-existing
      - uses: actions/upload-artifact@v4
        with:
          name: coles_snapshot
          path: ./output/coles/
      # Need to refresh credentials: the job can run for a while and the
      # assumed-role session expires.
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: "${{ vars.TARGET_ROLE_ARN }}"
          aws-region: ap-southeast-2
      - run: aws s3 sync ./output/coles/ s3://grocery-scrape-au/coles/
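  # scrape-woolies and scrape-coles run in parallel; this job waits for both
  # via "needs" and merges their snapshots into the price history.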
  merge-price-history:
    permissions:
      contents: read # Required for the checkout action
      id-token: write # Required for requesting the OIDC JWT
    runs-on: ubuntu-latest
    needs:
      - scrape-woolies
      - scrape-coles
    steps:
      - uses: actions/checkout@v4
      - name: Download coles artifact
        uses: actions/download-artifact@v4
        with:
          name: coles_snapshot
          path: ./output/coles/
      - name: Download woolies artifact
        uses: actions/download-artifact@v4
        with:
          name: woolies_snapshot
          path: ./output/woolies/
      - run: pip3 install -r requirements.txt
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: "${{ vars.TARGET_ROLE_ARN }}"
          aws-region: ap-southeast-2
      # Use "cp" (no "|| true") because it should be an error if the file
      # doesn't exist: that means the history is broken and needs to be
      # rebuilt from scratch locally.
      - run: aws s3 cp s3://grocery-scrape-au/latest-canonical.json.gz ./output/
      # Merge the downloaded snapshots into ./output/latest-canonical.json.gz
      # (uploaded as an artifact below).
      - run: python3 main.py analysis --compress
      - uses: actions/upload-artifact@v4
        with:
          name: latest_canonical
          path: ./output/latest-canonical.json.gz
      - name: Upload all files after analysis finishes
        # --content-encoding is necessary so that S3 sends the correct
        # Content-Encoding header on GET
        run: |
          aws s3 sync ./output/ s3://grocery-scrape-au/
          aws s3 cp --content-encoding gzip static/data/latest-canonical.woolies.compressed.json.gz s3://hotprices.org/data/
          aws s3 cp --content-encoding gzip static/data/latest-canonical.coles.compressed.json.gz s3://hotprices.org/data/
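# A quick way to verify the header is served correctly (a sketch; assumes the
# hotprices.org bucket is publicly readable behind that hostname):
#   curl -sI https://hotprices.org/data/latest-canonical.coles.compressed.json.gz \
#     | grep -i '^content-encoding'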