Commit

Fix: error tracking

x3r0s committed Oct 23, 2024
1 parent 056a5bf commit fcb2fb3

Showing 2 changed files with 32 additions and 12 deletions.
17 changes: 14 additions & 3 deletions .github/workflows/crawler.yml
@@ -32,15 +32,26 @@ jobs:
       - name: Install Chrome
         uses: browser-actions/setup-chrome@latest
 
+      - name: Check directory before crawling
+        run: ls -R dataset
+
       - name: Run crawler
-        run: python crawler.py
-        continue-on-error: true
+        run: python crawler.py --verbose
+        env:
+          PYTHONUNBUFFERED: 1
+
+      - name: Check directory after crawling
+        run: ls -R dataset
+
+      - name: Compress crawling results
+        run: tar -czvf crawling-results.tar.gz dataset/
 
       - name: Upload crawling results
         uses: actions/upload-artifact@v3
+        if: always()
         with:
           name: crawling-results
-          path: dataset/
+          path: crawling-results.tar.gz
 
       - name: Commit and push if changed
         run: |
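
Note: the workflow now uploads a single tarball instead of the raw dataset/ tree, and if: always() lets the upload step run even when an earlier step fails. A rough local stand-in for the compress step, useful for checking the artifact layout before pushing (a sketch only; it assumes a dataset/ directory exists in the working directory):

    # Local stand-in for the "Compress crawling results" step
    # (sketch; assumes dataset/ exists in the current directory).
    import tarfile

    with tarfile.open("crawling-results.tar.gz", "w:gz") as tar:
        # Mirrors `tar -czvf crawling-results.tar.gz dataset/`: the
        # archive holds a single top-level dataset/ directory.
        tar.add("dataset", arcname="dataset")
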
27 changes: 18 additions & 9 deletions crawler.py
@@ -15,6 +15,7 @@
 from datetime import datetime
 import argparse
 import shutil
+import traceback
 
 # Set the project root directory
 PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))
@@ -121,6 +122,7 @@ def append_to_json(product, filename):
 
 # Per-category crawling function
 def crawl_category(url, category, save_images):
+    print(f"Starting crawl of category '{category}'")
     try:
         driver = setup_driver()
         driver.get(url)
@@ -179,7 +181,8 @@ def crawl_category(url, category, save_images):
             break
 
     except Exception as e:
-        print(f"Error while processing category {category}: {e}")
+        print(f"Error while crawling category '{category}': {str(e)}")
+        print(traceback.format_exc())
     finally:
         driver.quit()
 
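Note: str(e) carries only the exception message, while traceback.format_exc() returns the full stack trace of the exception currently being handled, which is what makes the error traceable to a line of code. A minimal illustration, separate from the crawler code:

    # str(e) vs. traceback.format_exc() (illustrative only).
    import traceback

    def fail():
        return 1 / 0

    try:
        fail()
    except Exception as e:
        print(f"message only: {e}")    # -> division by zero
        print(traceback.format_exc())  # -> full stack trace, including fail()
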
@@ -199,24 +202,30 @@ def compress_data(output_dir):
     print(f"Data compression complete: {zip_filename}")
 
 # Main function
-def main(save_images=False):
+def main(save_images=False, verbose=False):
+    if verbose:
+        print("Verbose logging mode enabled")
+
     with open(os.path.join(PROJECT_ROOT, 'target-list.json'), 'r') as f:
         targets = json.load(f)
 
+    threads = []
     for category, url in targets.items():
-        try:
-            print(f"Started crawling {category}")
-            crawl_category(url, category, save_images)
-            print(f"Finished crawling {category}")
-        except Exception as e:
-            print(f"Error while crawling {category}: {e}")
+        thread = threading.Thread(target=crawl_category, args=(url, category, save_images))
+        threads.append(thread)
+        thread.start()
+        print(f"Started crawling {category}")
+
+    for thread in threads:
+        thread.join()
 
     output_dir = os.path.join(PROJECT_ROOT, 'dataset')
     compress_data(output_dir)
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='Danawa product information crawler')
     parser.add_argument('--save-images', action='store_true', help='whether to save images')
+    parser.add_argument('--verbose', action='store_true', help='print verbose logs')
     args = parser.parse_args()
 
-    main(save_images=args.save_images)
+    main(save_images=args.save_images, verbose=args.verbose)
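
Note: main() now starts one thread per category and joins them all, so categories crawl concurrently. Thread.join() does not propagate a worker's exception back to the joining thread, which is why crawl_category has to catch and print its own traceback. A minimal sketch of that behavior, using a hypothetical worker that is not part of the crawler:

    # join() returns normally even if the target raised: the default
    # threading excepthook prints the error to stderr, but the main
    # thread never receives the exception itself (hypothetical worker).
    import threading

    def worker():
        raise RuntimeError("only the worker itself can report this")

    t = threading.Thread(target=worker)
    t.start()
    t.join()  # no exception is re-raised here
    print("main thread continues despite the worker's failure")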
