From 4369bf2cb196759789dd696e3063af1374a57a69 Mon Sep 17 00:00:00 2001
From: Jourdelune <jourdelune863@gmail.com>
Date: Mon, 30 Sep 2024 21:37:31 +0200
Subject: [PATCH] remove custom youtube session management and use docker

---
 multi_crawler/crawlers/youtube_crawler.py |  89 +++++++-------
 multi_crawler/poo_generator.py            |  41 -------
 multi_crawler/scripts/poo_gen.sh          |   9 --
 multi_crawler/ytb_session.py              | 138 ----------------------
 test.py                                   |  20 ----
 5 files changed, 48 insertions(+), 249 deletions(-)
 delete mode 100644 multi_crawler/poo_generator.py
 delete mode 100755 multi_crawler/scripts/poo_gen.sh
 delete mode 100644 multi_crawler/ytb_session.py
 delete mode 100644 test.py

diff --git a/multi_crawler/crawlers/youtube_crawler.py b/multi_crawler/crawlers/youtube_crawler.py
index fba0fa7..483ddbb 100644
--- a/multi_crawler/crawlers/youtube_crawler.py
+++ b/multi_crawler/crawlers/youtube_crawler.py
@@ -1,13 +1,14 @@
-import json
+import http
 import logging
+import random
 import time
 from concurrent.futures import ThreadPoolExecutor
 from typing import Callable, Sequence
 
-from pytubefix import Search
+import pytubefix.exceptions
+from pytubefix import Search, YouTube
 
 from ..models import Audio
-from ..ytb_session import YtbSession
 from .crawlers import BaseCrawler
 
 
@@ -25,15 +26,12 @@ def __init__(
         self._terms = terms
         self._callback = callback
         self._num_processes = num_processes
-
+        self._terms = terms
         self.logging = logging.getLogger(__name__)
-        self._ytb_sessions = {}
 
         # Create a thread pool with max 10 threads
         self.executor = ThreadPoolExecutor(max_workers=num_processes)
         self.futures = set()
-
-        self._search = Search(terms, {"params": "EgIwAQ%3D%3D"})
         self._videos = set()
 
     def _manage_futures(self):
@@ -49,29 +47,30 @@ def _get_ytb_data(self, url):
         if url in self._videos:
             return
 
-        if len(self._ytb_sessions) == 0:
-            self._ytb_sessions[time.time()] = YtbSession(
-                {"quiet": True, "noprogress": True, "no_warnings": True}, max_attemps=50
-            )
-
-        # get the oldest session
-        session = self._ytb_sessions.pop(min(self._ytb_sessions.keys()))
-        # append a new session
-        self._ytb_sessions[time.time()] = session
-
-        try:
-            info = session.extract_info(url, download=False)
-        except Exception as e:
-            logging.error("Error extracting info from %s: %s", url, e)
-            return
-
-        logging.info("Found music video: %s", info["title"])
+        success = False
+        while not success:
+            try:
+                video = YouTube(
+                    url,
+                    proxies={
+                        "http": "http://127.0.0.1:3128",
+                        "https": "http://127.0.0.1:3128",
+                    },
+                )
+                _ = video.title
+                success = True
+            except Exception as e:  # pylint: disable=broad-except
+                if not isinstance(e, pytubefix.exceptions.BotDetection):
+                    logging.error("Failed to get video data: %s", e)
+                    return
+
+        logging.info("Found music video: %s", video.title)
         audio = Audio(
             url=url,
-            title=info["title"],
-            author=info["channel"] if "channel" in info else "",
-            description=info["description"],
-            tags=info["tags"],
+            title=video.title,
+            author=video.author,
+            description=video.description,
+            tags=video.keywords,
         )
 
         self._callback(audio)
@@ -80,16 +79,24 @@ def _get_ytb_data(self, url):
     def crawl(self, *args, **kwargs) -> None:
         """Find and return URLs of Youtube videos based on search terms."""
 
-        last_nbm_results = 0
-        while len(self._search.videos) > last_nbm_results:
-            for result in self._search.videos[last_nbm_results:]:
-                url = f"{self.YOUTUBE_ENDPOINT}/watch?v={result.video_id}"
-                future = self.executor.submit(self._get_ytb_data, url)
-                self.futures.add(future)
-
-                while len(self.futures) >= self._num_processes:
-                    time.sleep(0.1)
-                    self._manage_futures()
-
-            last_nbm_results = len(self._search.videos)
-            self._search.get_next_results()
+        success = False
+        while not success:
+            try:
+                search = Search(self._terms, {"params": "EgIwAQ%3D%3D"})
+                last_nbm_results = 0
+                while len(search.videos) > last_nbm_results:
+                    for result in search.videos[last_nbm_results:]:
+                        url = f"{self.YOUTUBE_ENDPOINT}/watch?v={result.video_id}"
+                        future = self.executor.submit(self._get_ytb_data, url)
+                        self.futures.add(future)
+
+                        while len(self.futures) >= self._num_processes:
+                            time.sleep(0.1)
+                            self._manage_futures()
+
+                    last_nbm_results = len(search.videos)
+                    search.get_next_results()
+                success = True
+            except Exception as e:
+                logging.error("Failed to get search results: %s", e)
+                time.sleep(5)
diff --git a/multi_crawler/poo_generator.py b/multi_crawler/poo_generator.py
deleted file mode 100644
index 04b2219..0000000
--- a/multi_crawler/poo_generator.py
+++ /dev/null
@@ -1,41 +0,0 @@
-"""
-Copied from https://github.com/iv-org/youtube-trusted-session-generator/tree/master
-"""
-
-import json
-import sys
-
-from nodriver import cdp, loop, start
-
-
-async def main():
-    browser = await start(headless=False)
-    tab = browser.main_tab
-    tab.add_handler(cdp.network.RequestWillBeSent, send_handler)
-    page = await browser.get("https://www.youtube.com/embed/jNQXAC9IVRw")
-    await tab.wait(cdp.network.RequestWillBeSent)
-    await tab.sleep(10)
-    button_play = await tab.select("#movie_player")
-    await button_play.click()
-    await tab.wait(cdp.network.RequestWillBeSent)
-    await tab.sleep(30)
-
-
-async def send_handler(event: cdp.network.RequestWillBeSent):
-    if "/youtubei/v1/player" in event.request.url:
-        post_data = event.request.post_data
-        post_data_json = json.loads(post_data)
-        visitor_data = post_data_json["context"]["client"]["visitorData"]
-        po_token = post_data_json["serviceIntegrityDimensions"]["poToken"]
-        print("visitor_data: " + visitor_data)
-        print("po_token: " + po_token)
-        if len(po_token) < 160:
-            print(
-                "[WARNING] there is a high chance that the potoken generated won't work. please try again on another internet connection."
-            )
-        sys.exit(0)
-    return
-
-
-if __name__ == "__main__":
-    loop().run_until_complete(main())
diff --git a/multi_crawler/scripts/poo_gen.sh b/multi_crawler/scripts/poo_gen.sh
deleted file mode 100755
index 9646f75..0000000
--- a/multi_crawler/scripts/poo_gen.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/sh
-
-XVFB_WHD=${XVFB_WHD:-1280x720x16}
-
-Xvfb :99 -ac -screen 0 $XVFB_WHD -nolisten tcp > /dev/null 2>&1 &
-sleep 2
-
-# Run python script on display 0
-DISPLAY=:99 python multi_crawler/poo_generator.py
\ No newline at end of file
diff --git a/multi_crawler/ytb_session.py b/multi_crawler/ytb_session.py
deleted file mode 100644
index 15e754d..0000000
--- a/multi_crawler/ytb_session.py
+++ /dev/null
@@ -1,138 +0,0 @@
-import logging
-import random
-import subprocess
-from typing import Any
-
-import yt_dlp
-from yt_dlp.utils import DownloadError
-
-# create a logger for the module with the module name
-logger = logging.getLogger(__name__)
-
-
-class SilentLogger:
-    """Silent logger class that does not log anything to avoid ram usage."""
-
-    def debug(self, msg):
-        pass
-
-    def info(self, msg):
-        pass
-
-    def warning(self, msg):
-        pass
-
-    def error(self, msg):
-        pass
-
-
-class YtbSession:
-    """Wrapper class for YoutubeDL that uses Tor as a proxy."""
-
-    def __init__(self, params: dict = None, max_attemps: int = -1, **kwargs):
-        """Initializes the TorWrapper with optional parameters.
-
-        Args:
-            params (dict, optional): Optional parameters for YoutubeDL. Defaults to None.
-            max_attemps (int, optional): Maximum number of attempts to retry a failed download. Defaults to -1 (infinite).
-
-        """
-        self.params = params if params is not None else {}
-        self.kwargs = kwargs
-        self._max_attempts = max_attemps
-
-        self.params["logger"] = SilentLogger()
-
-        self._init_ytdl()
-
-    def _gen_proxy(self) -> str:
-        """Generates a random proxy string using Tor."""
-        # creds = str(random.randint(10000, 10**9)) + ":" + "foobar"
-        return "http://127.0.0.1:3128"  # return f"socks5://{creds}@127.0.0.1:9050"
-
-    def _generate_poo(self):
-        logger.info("Generating poo token")
-        result = subprocess.run(
-            ["./multi_crawler/scripts/poo_gen.sh"],
-            capture_output=True,
-            text=True,
-            check=True,
-        )
-
-        result = result.stdout.strip()
-
-        if "warning" in result:
-            logger.warning("Failed to generate poo token. Retrying...")
-            return self._generate_poo()
-
-        poo_token = result.split("po_token: ")[1].split("\n")[0]
-        logger.info("Generated poo token: %s", poo_token[:10] + "...")
-        return poo_token.strip()
-
-    def _init_ytdl(self):
-        """Initializes or reinitializes the YoutubeDL instance with a new proxy."""
-        # Set a new proxy for each initialization
-        self.params["proxy"] = self._gen_proxy()
-
-        try:
-            self.params["po_token"] = f"web+{self._generate_poo()}"
-        except subprocess.CalledProcessError:
-            pass
-
-        self.ytdl = yt_dlp.YoutubeDL(self.params, **self.kwargs)
-        logger.info("Initialized YoutubeDL with proxy %s", self.params["proxy"])
-
-    def _handle_download_error(self, method_name: str, *args, **kwargs) -> Any:
-        """Handles DownloadError by reinitializing and retrying the method call in a loop.
-
-        Args:
-            method_name (str): The name of the method to call.
-
-        Returns:
-            any: The return value of the method call or raises the error if unrecoverable.
-        """
-
-        attempt = 0
-        while attempt < self._max_attempts or self._max_attempts == -1:
-            try:
-                method = getattr(self.ytdl, method_name)
-                return method(*args, **kwargs)
-            except DownloadError as e:
-                if (
-                    "sign in" in str(e).lower()
-                    or "failed to extract any player response" in str(e).lower()
-                ):
-                    logger.warning(
-                        "DownloadError in %s, reinitializing with new proxy... Attempt %d",
-                        method_name,
-                        attempt + 1,
-                    )
-                    attempt += 1
-                    self._init_ytdl()
-                else:
-                    raise e
-        # If maximum attempts exceeded, raise DownloadError
-        raise DownloadError(f"Failed after {attempt} attempts")
-
-    def extract_info(self, *args, **kwargs):
-        """Extracts information and handles DownloadError by reinitializing YoutubeDL."""
-        return self._handle_download_error("extract_info", *args, **kwargs)
-
-    def download(self, *args, **kwargs):
-        """Downloads a video and handles DownloadError by reinitializing YoutubeDL."""
-        return self._handle_download_error("download", *args, **kwargs)
-
-    def download_with_info_file(self, *args, **kwargs):
-        """Downloads a video with an info file, handles DownloadError by reinitializing."""
-        return self._handle_download_error("download_with_info_file", *args, **kwargs)
-
-    def __getattr__(self, name: str) -> Any:
-        """Redirects attribute access to the YoutubeDL instance.
-
-        Args:
-            name (str): The name of the attribute to access.
-
-        Returns:
-            any: The attribute value.
-        """
-        return getattr(self.ytdl, name)
diff --git a/test.py b/test.py
deleted file mode 100644
index f404586..0000000
--- a/test.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import requests
-
-
-def get_tor_session():
-    session = requests.session()
-    # Tor uses the 9050 port as the default socks port
-    session.proxies = {
-        "http": "socks5://127.0.0.1:9050",
-    }
-    return session
-
-
-# Make a request through the Tor connection
-# IP visible through Tor
-session = get_tor_session()
-print(session.get("http://httpbin.org/ip").text)
-# Above should print an IP different than your public IP
-
-# Following prints your normal public IP
-print(requests.get("http://httpbin.org/ip").text)