trigger exception when not related to bot detection
Jourdelune committed Sep 28, 2024
1 parent b802816 commit 4ddf197
Showing 5 changed files with 18 additions and 69 deletions.
2 changes: 1 addition & 1 deletion main.py
@@ -57,7 +57,7 @@
 
     if line.startswith("youtube:"):
         crawlers = YoutubeCrawler(
-            line.split(" ", 1)[1], callback=exporter, num_processes=5
+            line.split(" ", 1)[1], callback=exporter, num_processes=10
        )
     else:
         crawlers = ArchiveCrawler(line.split(" ", 1)[1], callback=exporter)
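The main.py change only doubles crawler parallelism from 5 to 10 workers. As a rough, purely hypothetical illustration of how a num_processes-style knob commonly maps onto a worker pool (the repository's actual mechanism is not shown in this diff; crawl_one and the URL list are invented):

import multiprocessing

def crawl_one(url: str) -> str:
    return f"processed {url}"  # placeholder for per-video download/extract work

if __name__ == "__main__":
    urls = [f"https://example.invalid/watch?v={i}" for i in range(20)]
    with multiprocessing.Pool(processes=10) as pool:  # mirrors num_processes=10
        for result in pool.imap_unordered(crawl_one, urls):
            print(result)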
20 changes: 9 additions & 11 deletions multi_crawler/crawlers/youtube_crawler.py
@@ -46,14 +46,10 @@ def _get_ytb_data(self, url):
         # append a new session
         self._ytb_sessions[time.time()] = session
 
-        info = session.extract_info(url, download=False)
-
-        if (
-            "categories" in info
-            and info["categories"] is not None
-            and not "Music" in info["categories"]
-        ):
-            logging.info("Skipping non-music video: %s", info["title"])
+        try:
+            info = session.extract_info(url, download=False)
+        except Exception as e:
+            logging.error("Error extracting info from %s: %s", url, e)
             return
 
         logging.info("Found music video: %s", info["title"])
@@ -120,7 +116,7 @@ def _get_youtube_init_data(self, url):
         context = None
 
         try:
-            response = requests.get(url)
+            response = requests.get(url, timeout=10)
             page_content = response.text
 
             yt_init_data = page_content.split("var ytInitialData =")
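The requests calls in this file previously had no timeout, and requests blocks indefinitely by default, so one stalled server could hang a crawler worker forever; the same timeout=10 bound is added to the requests.post call in _next_page below. A small illustration of the failure mode the bound guards against (the URL here is arbitrary):

import requests

try:
    # timeout=10 caps the connect and read phases at 10 seconds each;
    # without it, a silent server could block this call indefinitely.
    response = requests.get("https://www.youtube.com", timeout=10)
    response.raise_for_status()
except requests.exceptions.Timeout:
    print("timed out after 10s; skip or retry instead of hanging the worker")
except requests.exceptions.RequestException as exc:
    print(f"request failed: {exc}")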
@@ -149,7 +145,7 @@ def _get_youtube_init_data(self, url):
                 }
             else:
                 print("cannot_get_init_data")
-                raise Exception("cannot_get_init_data")
+                raise Exception("Cannot get init data")
 
         except Exception as ex:
             print(ex)
@@ -199,7 +195,9 @@ def _next_page(self, next_page):
         endpoint = f"{self.YOUTUBE_ENDPOINT}/youtubei/v1/search?key={next_page['nextPageToken']}"
 
         try:
-            response = requests.post(endpoint, json=next_page["nextPageContext"])
+            response = requests.post(
+                endpoint, json=next_page["nextPageContext"], timeout=10
+            )
             page_data = response.json()
 
             item1 = page_data["onResponseReceivedCommands"][0][
41 changes: 0 additions & 41 deletions multi_crawler/poo_generator.py

This file was deleted.

10 changes: 0 additions & 10 deletions multi_crawler/scripts/poo_gen.sh

This file was deleted.

14 changes: 8 additions & 6 deletions multi_crawler/ytb_session.py
@@ -48,12 +48,14 @@ def _handle_download_error(self, method_name: str, *args, **kwargs) -> Any:
         method = getattr(self.ytdl, method_name)
         try:
             return method(*args, **kwargs)
-        except DownloadError:
-            logging.warning(
-                "DownloadError in %s, reinitializing with new proxy...", method_name
-            )
-            self._init_ytdl()
-            return self._handle_download_error(method_name, *args, **kwargs)
+        except DownloadError as e:
+            if "bot" in str(e).lower():
+                logging.warning(
+                    "DownloadError in %s, reinitializing with new proxy...", method_name
+                )
+                self._init_ytdl()
+                return self._handle_download_error(method_name, *args, **kwargs)
+            raise e
 
     def extract_info(self, *args, **kwargs):
         """Extracts information and handles DownloadError by reinitializing YoutubeDL."""
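This is the core of the commit: previously every DownloadError triggered a proxy rotation and retry, which masked unrelated failures. Now only errors whose message mentions "bot" are retried; everything else re-raises to the caller. A self-contained sketch of that policy (FakeSession and the fail-once behavior are invented for demonstration; only the except-branch logic mirrors the commit):

import logging

class DownloadError(Exception):
    """Stand-in for yt_dlp.utils.DownloadError."""

class FakeSession:
    """Fails once with a bot-detection message, then succeeds."""
    calls = 0
    def extract_info(self, url, download=False):
        FakeSession.calls += 1
        if FakeSession.calls == 1:
            raise DownloadError("Sign in to confirm you're not a bot")
        return {"title": "ok"}

class YtbSessionSketch:
    def __init__(self):
        self._init_ytdl()

    def _init_ytdl(self):
        # The real code builds a fresh YoutubeDL instance behind a new proxy here.
        self.ytdl = FakeSession()

    def _handle_download_error(self, method_name, *args, **kwargs):
        method = getattr(self.ytdl, method_name)
        try:
            return method(*args, **kwargs)
        except DownloadError as e:
            if "bot" in str(e).lower():
                # Bot detection: rotate the session and retry the same call.
                logging.warning("DownloadError in %s, retrying with new proxy", method_name)
                self._init_ytdl()
                return self._handle_download_error(method_name, *args, **kwargs)
            raise  # any other failure now surfaces to the caller

print(YtbSessionSketch()._handle_download_error("extract_info", "https://youtu.be/x"))
# -> {'title': 'ok'} after one bot-detection retry

Note the retry is unbounded as written: a URL that keeps tripping bot detection will recurse until it succeeds, which matches the diff; non-bot errors are instead handled once by the new try/except in youtube_crawler.py.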