# Copied from the GitHub Actions web UI: workflow "Daily Crawler", run #12.
# Everything below is the workflow file for that run.
name: Daily Crawler
on:
  schedule:
    # 15:00 UTC every Sunday (= midnight Monday, Korea time).
    # NOTE(review): the workflow name says "Daily" but this schedule is weekly.
    - cron: '0 15 * * 0'
  workflow_dispatch: # allow manual runs

# The commit/push step below writes to the repository with the default
# GITHUB_TOKEN, which requires explicit contents write permission.
permissions:
  contents: write

jobs:
  crawl:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12.4'

      - name: Cache pip packages
        uses: actions/cache@v4
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
          restore-keys: |
            ${{ runner.os }}-pip-

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Setup Chrome
        # NOTE(review): '@latest' is a moving target; consider pinning a
        # release tag for reproducible runs.
        uses: browser-actions/setup-chrome@latest

      - name: Run crawler
        run: python crawler.py --verbose --workflow
        env:
          PYTHONUNBUFFERED: "1"

      - name: Commit and push changes
        run: |
          git config --global user.email "[email protected]"
          git config --global user.name "xeros"
          # Commit each category file separately when it changed.
          for file in dataset/*.json; do
            category=$(basename "$file" .json)
            git add "$file"
            if git diff --staged --quiet; then
              echo "No changes in $category"
            else
              git commit -m "Update data for $category"
            fi
          done
          # Commit today's history file, if the crawler produced one.
          today=$(date +%Y-%m-%d)
          history_file="dataset/history/${today}.json"
          if [ -f "$history_file" ]; then
            git add "$history_file"
            # Guard the commit: committing an unchanged staged file would
            # make git exit non-zero and fail the whole job.
            if ! git diff --staged --quiet; then
              git commit -m "Add history data for $today"
            fi
          fi
          # Push once at the end instead of once per commit inside the loop.
          # A push with nothing new is a harmless no-op.
          git push

      - name: Check crawling results
        run: |
          echo "Crawling results:"
          for file in dataset/*.json; do
            category=$(basename "$file" .json)
            count=$(jq length "$file")
            echo "$category: $count products"
          done
          # Report categories listed in target-list.json but missing on disk.
          expected_categories=$(jq -r 'keys[]' target-list.json)
          for category in $expected_categories; do
            if [ ! -f "dataset/${category}.json" ]; then
              echo "Missing category: ${category}"
            fi
          done
          # Shell variables do NOT persist across steps, so $today and
          # $history_file from the commit step are unset here — recompute
          # them (the original always took the error branch because of this).
          today=$(date +%Y-%m-%d)
          history_file="dataset/history/${today}.json"
          if [ -f "$history_file" ]; then
            echo "Today's history file created: ${today}.json"
          else
            echo "Error: Today's history file not found"
          fi