From c07c12c9b702b65a1f9110304e0bc78eb0adb080 Mon Sep 17 00:00:00 2001 From: Jourdelune Date: Sun, 29 Sep 2024 09:09:24 +0200 Subject: [PATCH] reset ip each time detected as bot by youtube instead of using docker --- README.md | 10 +--------- multi_crawler/ytb_session.py | 24 +++++++++++++++++++----- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 19c00f1..7ae7c99 100644 --- a/README.md +++ b/README.md @@ -8,19 +8,11 @@ The crawler for WaveGenAI pip install -r requirements.txt ``` -2. Install Docker - ## Usage -Run the proxy -```bash -docker run -d --rm -it -p 3128:3128 -p 4444:4444 -e "TOR_INSTANCES=40" zhaowde/rotating-tor-http-proxy -``` - - Run the crawler ```bash -python main.py --csv --input FILE.txt --overwrite --file_name FILE.csv +python main.py --csv --input FILE.txt --overwrite --file_name FILE.csv --num_processes 40 ``` ## License diff --git a/multi_crawler/ytb_session.py b/multi_crawler/ytb_session.py index 362916b..0358698 100644 --- a/multi_crawler/ytb_session.py +++ b/multi_crawler/ytb_session.py @@ -1,4 +1,5 @@ import logging +import random from typing import Any import yt_dlp @@ -27,21 +28,32 @@ def error(self, msg): class YtbSession: """Wrapper class for YoutubeDL that uses Tor as a proxy.""" - def __init__(self, params: dict = None, max_attemps: int = 200, **kwargs): + def __init__(self, params: dict = None, max_attemps: int = -1, **kwargs): """Initializes the TorWrapper with optional parameters. Args: params (dict, optional): Optional parameters for YoutubeDL. Defaults to None. + max_attemps (int, optional): Maximum number of attempts to retry a failed download. Defaults to -1 (infinite). """ self.params = params if params is not None else {} self.kwargs = kwargs self._max_attempts = max_attemps - self.params["logger"] = SilentLogger() - self.params["proxy"] = "127.0.0.1:3128" + self._init_ytdl() + + def _gen_proxy(self) -> str: + """Generates a random proxy string using Tor.""" + creds = str(random.randint(10000, 10**9)) + ":" + "foobar" + return f"socks5://{creds}@127.0.0.1:9050" + def _init_ytdl(self): + """Initializes or reinitializes the YoutubeDL instance with a new proxy.""" + # Set a new proxy for each initialization + self.params["proxy"] = self._gen_proxy() + self.params["logger"] = SilentLogger() self.ytdl = yt_dlp.YoutubeDL(self.params, **self.kwargs) + logger.info("Initialized YoutubeDL with proxy %s", self.params["proxy"]) def _handle_download_error(self, method_name: str, *args, **kwargs) -> Any: """Handles DownloadError by reinitializing and retrying the method call in a loop. @@ -52,11 +64,12 @@ def _handle_download_error(self, method_name: str, *args, **kwargs) -> Any: Returns: any: The return value of the method call or raises the error if unrecoverable. """ - method = getattr(self.ytdl, method_name) + attempt = 0 - while attempt < self._max_attempts: + while attempt < self._max_attempts or self._max_attempts == -1: try: + method = getattr(self.ytdl, method_name) return method(*args, **kwargs) except DownloadError as e: if ( @@ -69,6 +82,7 @@ def _handle_download_error(self, method_name: str, *args, **kwargs) -> Any: attempt + 1, ) attempt += 1 + self._init_ytdl() else: raise e # If maximum attempts exceeded, raise DownloadError