Commit

Fix: error tracking

x3r0s committed Oct 23, 2024
1 parent 056a5bf commit fcb2fb3

Showing 2 changed files with 32 additions and 12 deletions.
17 changes: 14 additions & 3 deletions .github/workflows/crawler.yml
@@ -32,15 +32,26 @@ jobs:
       - name: Install Chrome
         uses: browser-actions/setup-chrome@latest
 
+      - name: Check directory before crawling
+        run: ls -R dataset
+
       - name: Run crawler
-        run: python crawler.py
-        continue-on-error: true
+        run: python crawler.py --verbose
+        env:
+          PYTHONUNBUFFERED: 1
+
+      - name: Check directory after crawling
+        run: ls -R dataset
+
+      - name: Compress crawling results
+        run: tar -czvf crawling-results.tar.gz dataset/
 
       - name: Upload crawling results
         uses: actions/upload-artifact@v3
+        if: always()
         with:
           name: crawling-results
-          path: dataset/
+          path: crawling-results.tar.gz
 
       - name: Commit and push if changed
         run: |
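
Note: the workflow now uploads a single tarball instead of the raw dataset/ tree, and if: always() lets the upload step run even when an earlier step fails. A rough local stand-in for the compress step, useful for checking the artifact layout before pushing (a sketch only; it assumes a dataset/ directory exists in the working directory):

    # Local stand-in for the "Compress crawling results" step
    # (sketch; assumes dataset/ exists in the current directory).
    import tarfile

    with tarfile.open("crawling-results.tar.gz", "w:gz") as tar:
        # Mirrors `tar -czvf crawling-results.tar.gz dataset/`: the
        # archive holds a single top-level dataset/ directory.
        tar.add("dataset", arcname="dataset")
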
27 changes: 18 additions & 9 deletions crawler.py
@@ -15,6 +15,7 @@
 from datetime import datetime
 import argparse
 import shutil
+import traceback
 
 # Set the project root directory
 PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))
@@ -121,6 +122,7 @@ def append_to_json(product, filename):
 
 # Per-category crawling function
 def crawl_category(url, category, save_images):
+    print(f"Starting crawl of category '{category}'")
     try:
         driver = setup_driver()
         driver.get(url)
@@ -179,7 +181,8 @@ def crawl_category(url, category, save_images):
             break
 
     except Exception as e:
-        print(f"Error while processing category {category}: {e}")
+        print(f"Error while crawling category '{category}': {str(e)}")
+        print(traceback.format_exc())
     finally:
         driver.quit()
 
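Note: str(e) carries only the exception message, while traceback.format_exc() returns the full stack trace of the exception currently being handled, which is what makes the error traceable to a line of code. A minimal illustration, separate from the crawler code:

    # str(e) vs. traceback.format_exc() (illustrative only).
    import traceback

    def fail():
        return 1 / 0

    try:
        fail()
    except Exception as e:
        print(f"message only: {e}")    # -> division by zero
        print(traceback.format_exc())  # -> full stack trace, including fail()
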
@@ -199,24 +202,30 @@ def compress_data(output_dir):
     print(f"Data compression complete: {zip_filename}")
 
 # Main function
-def main(save_images=False):
+def main(save_images=False, verbose=False):
+    if verbose:
+        print("Verbose logging mode enabled")
+
     with open(os.path.join(PROJECT_ROOT, 'target-list.json'), 'r') as f:
         targets = json.load(f)
 
+    threads = []
     for category, url in targets.items():
-        try:
-            print(f"Started crawling {category}")
-            crawl_category(url, category, save_images)
-            print(f"Finished crawling {category}")
-        except Exception as e:
-            print(f"Error while crawling {category}: {e}")
+        thread = threading.Thread(target=crawl_category, args=(url, category, save_images))
+        threads.append(thread)
+        thread.start()
+        print(f"Started crawling {category}")
+
+    for thread in threads:
+        thread.join()
 
     output_dir = os.path.join(PROJECT_ROOT, 'dataset')
     compress_data(output_dir)
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='Danawa product information crawler')
     parser.add_argument('--save-images', action='store_true', help='whether to save images')
+    parser.add_argument('--verbose', action='store_true', help='print verbose logs')
     args = parser.parse_args()
 
-    main(save_images=args.save_images)
+    main(save_images=args.save_images, verbose=args.verbose)
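
Note: main() now starts one thread per category and joins them all, so categories crawl concurrently. Thread.join() does not propagate a worker's exception back to the joining thread, which is why crawl_category has to catch and print its own traceback. A minimal sketch of that behavior, using a hypothetical worker that is not part of the crawler:

    # join() returns normally even if the target raised: the default
    # threading excepthook prints the error to stderr, but the main
    # thread never receives the exception itself (hypothetical worker).
    import threading

    def worker():
        raise RuntimeError("only the worker itself can report this")

    t = threading.Thread(target=worker)
    t.start()
    t.join()  # no exception is re-raised here
    print("main thread continues despite the worker's failure")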
