trigger exception when not related to bot detection
Jourdelune committed Sep 28, 2024
1 parent b802816 commit 4ddf197
Showing 5 changed files with 18 additions and 69 deletions.
2 changes: 1 addition & 1 deletion main.py
@@ -57,7 +57,7 @@
 
     if line.startswith("youtube:"):
         crawlers = YoutubeCrawler(
-            line.split(" ", 1)[1], callback=exporter, num_processes=5
+            line.split(" ", 1)[1], callback=exporter, num_processes=10
        )
     else:
         crawlers = ArchiveCrawler(line.split(" ", 1)[1], callback=exporter)
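The main.py change only doubles crawler parallelism from 5 to 10 workers. As a rough, purely hypothetical illustration of how a num_processes-style knob commonly maps onto a worker pool (the repository's actual mechanism is not shown in this diff; crawl_one and the URL list are invented):

import multiprocessing

def crawl_one(url: str) -> str:
    return f"processed {url}"  # placeholder for per-video download/extract work

if __name__ == "__main__":
    urls = [f"https://example.invalid/watch?v={i}" for i in range(20)]
    with multiprocessing.Pool(processes=10) as pool:  # mirrors num_processes=10
        for result in pool.imap_unordered(crawl_one, urls):
            print(result)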
20 changes: 9 additions & 11 deletions multi_crawler/crawlers/youtube_crawler.py
@@ -46,14 +46,10 @@ def _get_ytb_data(self, url):
         # append a new session
         self._ytb_sessions[time.time()] = session
 
-        info = session.extract_info(url, download=False)
-
-        if (
-            "categories" in info
-            and info["categories"] is not None
-            and not "Music" in info["categories"]
-        ):
-            logging.info("Skipping non-music video: %s", info["title"])
+        try:
+            info = session.extract_info(url, download=False)
+        except Exception as e:
+            logging.error("Error extracting info from %s: %s", url, e)
             return
 
         logging.info("Found music video: %s", info["title"])
@@ -120,7 +116,7 @@ def _get_youtube_init_data(self, url):
         context = None
 
         try:
-            response = requests.get(url)
+            response = requests.get(url, timeout=10)
             page_content = response.text
 
             yt_init_data = page_content.split("var ytInitialData =")
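The requests calls in this file previously had no timeout, and requests blocks indefinitely by default, so one stalled server could hang a crawler worker forever; the same timeout=10 bound is added to the requests.post call in _next_page below. A small illustration of the failure mode the bound guards against (the URL here is arbitrary):

import requests

try:
    # timeout=10 caps the connect and read phases at 10 seconds each;
    # without it, a silent server could block this call indefinitely.
    response = requests.get("https://www.youtube.com", timeout=10)
    response.raise_for_status()
except requests.exceptions.Timeout:
    print("timed out after 10s; skip or retry instead of hanging the worker")
except requests.exceptions.RequestException as exc:
    print(f"request failed: {exc}")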
@@ -149,7 +145,7 @@ def _get_youtube_init_data(self, url):
                 }
             else:
                 print("cannot_get_init_data")
-                raise Exception("cannot_get_init_data")
+                raise Exception("Cannot get init data")
 
         except Exception as ex:
             print(ex)
@@ -199,7 +195,9 @@ def _next_page(self, next_page):
         endpoint = f"{self.YOUTUBE_ENDPOINT}/youtubei/v1/search?key={next_page['nextPageToken']}"
 
         try:
-            response = requests.post(endpoint, json=next_page["nextPageContext"])
+            response = requests.post(
+                endpoint, json=next_page["nextPageContext"], timeout=10
+            )
             page_data = response.json()
 
             item1 = page_data["onResponseReceivedCommands"][0][
41 changes: 0 additions & 41 deletions multi_crawler/poo_generator.py

This file was deleted.

10 changes: 0 additions & 10 deletions multi_crawler/scripts/poo_gen.sh

This file was deleted.

14 changes: 8 additions & 6 deletions multi_crawler/ytb_session.py
@@ -48,12 +48,14 @@ def _handle_download_error(self, method_name: str, *args, **kwargs) -> Any:
         method = getattr(self.ytdl, method_name)
         try:
             return method(*args, **kwargs)
-        except DownloadError:
-            logging.warning(
-                "DownloadError in %s, reinitializing with new proxy...", method_name
-            )
-            self._init_ytdl()
-            return self._handle_download_error(method_name, *args, **kwargs)
+        except DownloadError as e:
+            if "bot" in str(e).lower():
+                logging.warning(
+                    "DownloadError in %s, reinitializing with new proxy...", method_name
+                )
+                self._init_ytdl()
+                return self._handle_download_error(method_name, *args, **kwargs)
+            raise e
 
     def extract_info(self, *args, **kwargs):
         """Extracts information and handles DownloadError by reinitializing YoutubeDL."""
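This is the core of the commit: previously every DownloadError triggered a proxy rotation and retry, which masked unrelated failures. Now only errors whose message mentions "bot" are retried; everything else re-raises to the caller. A self-contained sketch of that policy (FakeSession and the fail-once behavior are invented for demonstration; only the except-branch logic mirrors the commit):

import logging

class DownloadError(Exception):
    """Stand-in for yt_dlp.utils.DownloadError."""

class FakeSession:
    """Fails once with a bot-detection message, then succeeds."""
    calls = 0
    def extract_info(self, url, download=False):
        FakeSession.calls += 1
        if FakeSession.calls == 1:
            raise DownloadError("Sign in to confirm you're not a bot")
        return {"title": "ok"}

class YtbSessionSketch:
    def __init__(self):
        self._init_ytdl()

    def _init_ytdl(self):
        # The real code builds a fresh YoutubeDL instance behind a new proxy here.
        self.ytdl = FakeSession()

    def _handle_download_error(self, method_name, *args, **kwargs):
        method = getattr(self.ytdl, method_name)
        try:
            return method(*args, **kwargs)
        except DownloadError as e:
            if "bot" in str(e).lower():
                # Bot detection: rotate the session and retry the same call.
                logging.warning("DownloadError in %s, retrying with new proxy", method_name)
                self._init_ytdl()
                return self._handle_download_error(method_name, *args, **kwargs)
            raise  # any other failure now surfaces to the caller

print(YtbSessionSketch()._handle_download_error("extract_info", "https://youtu.be/x"))
# -> {'title': 'ok'} after one bot-detection retry

Note the retry is unbounded as written: a URL that keeps tripping bot detection will recurse until it succeeds, which matches the diff; non-bot errors are instead handled once by the new try/except in youtube_crawler.py.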