From 4369bf2cb196759789dd696e3063af1374a57a69 Mon Sep 17 00:00:00 2001 From: Jourdelune Date: Mon, 30 Sep 2024 21:37:31 +0200 Subject: [PATCH] remove custom youtube session managment and use docker --- multi_crawler/crawlers/youtube_crawler.py | 89 +++++++------- multi_crawler/poo_generator.py | 41 ------- multi_crawler/scripts/poo_gen.sh | 9 -- multi_crawler/ytb_session.py | 138 ---------------------- test.py | 20 ---- 5 files changed, 48 insertions(+), 249 deletions(-) delete mode 100644 multi_crawler/poo_generator.py delete mode 100755 multi_crawler/scripts/poo_gen.sh delete mode 100644 multi_crawler/ytb_session.py delete mode 100644 test.py diff --git a/multi_crawler/crawlers/youtube_crawler.py b/multi_crawler/crawlers/youtube_crawler.py index fba0fa7..483ddbb 100644 --- a/multi_crawler/crawlers/youtube_crawler.py +++ b/multi_crawler/crawlers/youtube_crawler.py @@ -1,13 +1,14 @@ -import json +import http import logging +import random import time from concurrent.futures import ThreadPoolExecutor from typing import Callable, Sequence -from pytubefix import Search +import pytubefix.exceptions +from pytubefix import Search, YouTube from ..models import Audio -from ..ytb_session import YtbSession from .crawlers import BaseCrawler @@ -25,15 +26,12 @@ def __init__( self._terms = terms self._callback = callback self._num_processes = num_processes - + self._terms = terms self.logging = logging.getLogger(__name__) - self._ytb_sessions = {} # Create a thread pool with max 10 threads self.executor = ThreadPoolExecutor(max_workers=num_processes) self.futures = set() - - self._search = Search(terms, {"params": "EgIwAQ%3D%3D"}) self._videos = set() def _manage_futures(self): @@ -49,29 +47,30 @@ def _get_ytb_data(self, url): if url in self._videos: return - if len(self._ytb_sessions) == 0: - self._ytb_sessions[time.time()] = YtbSession( - {"quiet": True, "noprogress": True, "no_warnings": True}, max_attemps=50 - ) - - # get the oldest session - session = self._ytb_sessions.pop(min(self._ytb_sessions.keys())) - # append a new session - self._ytb_sessions[time.time()] = session - - try: - info = session.extract_info(url, download=False) - except Exception as e: - logging.error("Error extracting info from %s: %s", url, e) - return - - logging.info("Found music video: %s", info["title"]) + success = False + while not success: + try: + video = YouTube( + url, + proxies={ + "http": "http://127.0.0.1:3128", + "https": "http://127.0.0.1:3128", + }, + ) + _ = video.title + success = True + except Exception as e: # pylint: disable=broad-except + if not isinstance(e, pytubefix.exceptions.BotDetection): + logging.error("Failed to get video data: %s", e) + return + + logging.info("Found music video: %s", video.title) audio = Audio( url=url, - title=info["title"], - author=info["channel"] if "channel" in info else "", - description=info["description"], - tags=info["tags"], + title=video.title, + author=video.author, + description=video.description, + tags=video.keywords, ) self._callback(audio) @@ -80,16 +79,24 @@ def _get_ytb_data(self, url): def crawl(self, *args, **kwargs) -> None: """Find and return URLs of Youtube videos based on search terms.""" - last_nbm_results = 0 - while len(self._search.videos) > last_nbm_results: - for result in self._search.videos[last_nbm_results:]: - url = f"{self.YOUTUBE_ENDPOINT}/watch?v={result.video_id}" - future = self.executor.submit(self._get_ytb_data, url) - self.futures.add(future) - - while len(self.futures) >= self._num_processes: - time.sleep(0.1) - self._manage_futures() - - last_nbm_results = len(self._search.videos) - self._search.get_next_results() + success = False + while not success: + try: + search = Search(self._terms, {"params": "EgIwAQ%3D%3D"}) + last_nbm_results = 0 + while len(search.videos) > last_nbm_results: + for result in search.videos[last_nbm_results:]: + url = f"{self.YOUTUBE_ENDPOINT}/watch?v={result.video_id}" + future = self.executor.submit(self._get_ytb_data, url) + self.futures.add(future) + + while len(self.futures) >= self._num_processes: + time.sleep(0.1) + self._manage_futures() + + last_nbm_results = len(search.videos) + search.get_next_results() + success = True + except Exception as e: + logging.error("Failed to get search results: %s", e) + time.sleep(5) diff --git a/multi_crawler/poo_generator.py b/multi_crawler/poo_generator.py deleted file mode 100644 index 04b2219..0000000 --- a/multi_crawler/poo_generator.py +++ /dev/null @@ -1,41 +0,0 @@ -""" -Copied from https://github.com/iv-org/youtube-trusted-session-generator/tree/master -""" - -import json -import sys - -from nodriver import cdp, loop, start - - -async def main(): - browser = await start(headless=False) - tab = browser.main_tab - tab.add_handler(cdp.network.RequestWillBeSent, send_handler) - page = await browser.get("https://www.youtube.com/embed/jNQXAC9IVRw") - await tab.wait(cdp.network.RequestWillBeSent) - await tab.sleep(10) - button_play = await tab.select("#movie_player") - await button_play.click() - await tab.wait(cdp.network.RequestWillBeSent) - await tab.sleep(30) - - -async def send_handler(event: cdp.network.RequestWillBeSent): - if "/youtubei/v1/player" in event.request.url: - post_data = event.request.post_data - post_data_json = json.loads(post_data) - visitor_data = post_data_json["context"]["client"]["visitorData"] - po_token = post_data_json["serviceIntegrityDimensions"]["poToken"] - print("visitor_data: " + visitor_data) - print("po_token: " + po_token) - if len(po_token) < 160: - print( - "[WARNING] there is a high chance that the potoken generated won't work. please try again on another internet connection." - ) - sys.exit(0) - return - - -if __name__ == "__main__": - loop().run_until_complete(main()) diff --git a/multi_crawler/scripts/poo_gen.sh b/multi_crawler/scripts/poo_gen.sh deleted file mode 100755 index 9646f75..0000000 --- a/multi_crawler/scripts/poo_gen.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/sh - -XVFB_WHD=${XVFB_WHD:-1280x720x16} - -Xvfb :99 -ac -screen 0 $XVFB_WHD -nolisten tcp > /dev/null 2>&1 & -sleep 2 - -# Run python script on display 0 -DISPLAY=:99 python multi_crawler/poo_generator.py \ No newline at end of file diff --git a/multi_crawler/ytb_session.py b/multi_crawler/ytb_session.py deleted file mode 100644 index 15e754d..0000000 --- a/multi_crawler/ytb_session.py +++ /dev/null @@ -1,138 +0,0 @@ -import logging -import random -import subprocess -from typing import Any - -import yt_dlp -from yt_dlp.utils import DownloadError - -# create a logger for the module with the module name -logger = logging.getLogger(__name__) - - -class SilentLogger: - """Silent logger class that does not log anything to avoid ram usage.""" - - def debug(self, msg): - pass - - def info(self, msg): - pass - - def warning(self, msg): - pass - - def error(self, msg): - pass - - -class YtbSession: - """Wrapper class for YoutubeDL that uses Tor as a proxy.""" - - def __init__(self, params: dict = None, max_attemps: int = -1, **kwargs): - """Initializes the TorWrapper with optional parameters. - - Args: - params (dict, optional): Optional parameters for YoutubeDL. Defaults to None. - max_attemps (int, optional): Maximum number of attempts to retry a failed download. Defaults to -1 (infinite). - - """ - self.params = params if params is not None else {} - self.kwargs = kwargs - self._max_attempts = max_attemps - - self.params["logger"] = SilentLogger() - - self._init_ytdl() - - def _gen_proxy(self) -> str: - """Generates a random proxy string using Tor.""" - # creds = str(random.randint(10000, 10**9)) + ":" + "foobar" - return "http://127.0.0.1:3128" # return f"socks5://{creds}@127.0.0.1:9050" - - def _generate_poo(self): - logger.info("Generating poo token") - result = subprocess.run( - ["./multi_crawler/scripts/poo_gen.sh"], - capture_output=True, - text=True, - check=True, - ) - - result = result.stdout.strip() - - if "warning" in result: - logger.warning("Failed to generate poo token. Retrying...") - return self._generate_poo() - - poo_token = result.split("po_token: ")[1].split("\n")[0] - logger.info("Generated poo token: %s", poo_token[:10] + "...") - return poo_token.strip() - - def _init_ytdl(self): - """Initializes or reinitializes the YoutubeDL instance with a new proxy.""" - # Set a new proxy for each initialization - self.params["proxy"] = self._gen_proxy() - - try: - self.params["po_token"] = f"web+{self._generate_poo()}" - except subprocess.CalledProcessError: - pass - - self.ytdl = yt_dlp.YoutubeDL(self.params, **self.kwargs) - logger.info("Initialized YoutubeDL with proxy %s", self.params["proxy"]) - - def _handle_download_error(self, method_name: str, *args, **kwargs) -> Any: - """Handles DownloadError by reinitializing and retrying the method call in a loop. - - Args: - method_name (str): The name of the method to call. - - Returns: - any: The return value of the method call or raises the error if unrecoverable. - """ - - attempt = 0 - while attempt < self._max_attempts or self._max_attempts == -1: - try: - method = getattr(self.ytdl, method_name) - return method(*args, **kwargs) - except DownloadError as e: - if ( - "sign in" in str(e).lower() - or "failed to extract any player response" in str(e).lower() - ): - logger.warning( - "DownloadError in %s, reinitializing with new proxy... Attempt %d", - method_name, - attempt + 1, - ) - attempt += 1 - self._init_ytdl() - else: - raise e - # If maximum attempts exceeded, raise DownloadError - raise DownloadError(f"Failed after {attempt} attempts") - - def extract_info(self, *args, **kwargs): - """Extracts information and handles DownloadError by reinitializing YoutubeDL.""" - return self._handle_download_error("extract_info", *args, **kwargs) - - def download(self, *args, **kwargs): - """Downloads a video and handles DownloadError by reinitializing YoutubeDL.""" - return self._handle_download_error("download", *args, **kwargs) - - def download_with_info_file(self, *args, **kwargs): - """Downloads a video with an info file, handles DownloadError by reinitializing.""" - return self._handle_download_error("download_with_info_file", *args, **kwargs) - - def __getattr__(self, name: str) -> Any: - """Redirects attribute access to the YoutubeDL instance. - - Args: - name (str): The name of the attribute to access. - - Returns: - any: The attribute value. - """ - return getattr(self.ytdl, name) diff --git a/test.py b/test.py deleted file mode 100644 index f404586..0000000 --- a/test.py +++ /dev/null @@ -1,20 +0,0 @@ -import requests - - -def get_tor_session(): - session = requests.session() - # Tor uses the 9050 port as the default socks port - session.proxies = { - "http": "socks5://127.0.0.1:9050", - } - return session - - -# Make a request through the Tor connection -# IP visible through Tor -session = get_tor_session() -print(session.get("http://httpbin.org/ip").text) -# Above should print an IP different than your public IP - -# Following prints your normal public IP -print(requests.get("http://httpbin.org/ip").text)