Skip to content

Commit

Permalink
return to docker with tor and po token
Browse files Browse the repository at this point in the history
  • Loading branch information
Jourdelune committed Sep 29, 2024
1 parent 326d1ae commit 595e30c
Show file tree
Hide file tree
Showing 6 changed files with 92 additions and 11 deletions.
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,15 @@ The crawler for WaveGenAI
pip install -r requirements.txt
```

2. Install Docker

## Usage

Run the proxy (a rotating Tor HTTP proxy, exposed locally on port 3128)
```bash
docker run -d --rm -it -p 3128:3128 -p 4444:4444 -e "TOR_INSTANCES=40" zhaowde/rotating-tor-http-proxy
```

Run the crawler
```bash
python main.py --csv --input FILE.txt --overwrite --file_name FILE.csv --num_processes 40
Expand Down
3 changes: 1 addition & 2 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@
logging.basicConfig(level=logging.INFO)
load_dotenv(override=True)

# disable logging from ytb_session
logging.getLogger("multi_crawler.ytb_session").setLevel(logging.ERROR)
logging.getLogger("multi_crawler.ytb_session").setLevel(logging.CRITICAL)

if __name__ == "__main__":
argparser = argparse.ArgumentParser(
Expand Down
12 changes: 6 additions & 6 deletions multi_crawler/crawlers/youtube_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,7 @@ def __init__(
self._num_processes = num_processes

self.logging = logging.getLogger(__name__)
self._ytb_sessions = {
time.time(): YtbSession(
{"quiet": True, "noprogress": True, "no_warnings": True}, max_attemps=50
)
for _ in range(num_processes)
}
self._ytb_sessions = {}

# Create a thread pool with one worker per requested process (num_processes)
self.executor = ThreadPoolExecutor(max_workers=num_processes)
Expand All @@ -54,6 +49,11 @@ def _get_ytb_data(self, url):
if url in self._videos:
return

if len(self._ytb_sessions) == 0:
self._ytb_sessions[time.time()] = YtbSession(
{"quiet": True, "noprogress": True, "no_warnings": True}, max_attemps=50
)

# get the oldest session
session = self._ytb_sessions.pop(min(self._ytb_sessions.keys()))
# append a new session
Expand Down
41 changes: 41 additions & 0 deletions multi_crawler/poo_generator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
"""
Copied from https://github.com/iv-org/youtube-trusted-session-generator/tree/master
"""

import json
import sys

from nodriver import cdp, loop, start


async def main():
    """Drive a browser through a YouTube embed so a player request is emitted.

    Registers ``send_handler`` for every ``RequestWillBeSent`` network event,
    opens the embed page, waits, clicks the ``#movie_player`` element and
    waits again. The handler is what prints the tokens and ends the process;
    this coroutine only produces the traffic it listens for.
    """
    embed_url = "https://www.youtube.com/embed/jNQXAC9IVRw"
    request_event = cdp.network.RequestWillBeSent

    # A visible (non-headless) browser is requested; the wrapper script
    # presumably supplies the display through Xvfb -- confirm when running
    # this module directly.
    browser = await start(headless=False)
    tab = browser.main_tab
    tab.add_handler(request_event, send_handler)

    # Load the embed page and let its first requests go out.
    await browser.get(embed_url)
    await tab.wait(request_event)
    await tab.sleep(10)

    # Click the player, then leave time for the resulting requests to be
    # seen by the handler.
    player = await tab.select("#movie_player")
    await player.click()
    await tab.wait(request_event)
    await tab.sleep(30)


async def send_handler(event: cdp.network.RequestWillBeSent):
    """Print the visitor data and PO token from a player request, then exit.

    Requests whose URL does not contain ``/youtubei/v1/player`` are ignored.
    For a matching request the POST body is parsed as JSON, the
    ``visitorData`` and ``poToken`` fields are printed on stdout, and the
    process exits with status 0. A token shorter than 160 characters
    additionally prints a warning line.
    """
    request = event.request
    if "/youtubei/v1/player" not in request.url:
        return

    payload = json.loads(request.post_data)
    visitor_data = payload["context"]["client"]["visitorData"]
    po_token = payload["serviceIntegrityDimensions"]["poToken"]

    # These two lines are the machine-readable output of the script.
    print("visitor_data: " + visitor_data)
    print("po_token: " + po_token)

    if len(po_token) < 160:
        print(
            "[WARNING] there is a high chance that the potoken generated won't work. please try again on another internet connection."
        )

    # The first matching request is enough; stop the whole process.
    sys.exit(0)


if __name__ == "__main__":
    # Run the token generator to completion on nodriver's event loop.
    event_loop = loop()
    event_loop.run_until_complete(main())
9 changes: 9 additions & 0 deletions multi_crawler/scripts/poo_gen.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/sh
# Run the PO-token generator inside a virtual X display.

# Screen geometry handed to Xvfb (WIDTHxHEIGHTxDEPTH); override via the
# environment.
XVFB_WHD=${XVFB_WHD:-1280x720x16}

# Start a virtual X server on display :99 in the background. Its output is
# discarded, so a failure to start (for example because :99 is already being
# served) is silent. The server is not stopped when this script ends.
Xvfb :99 -ac -screen 0 "$XVFB_WHD" -nolisten tcp > /dev/null 2>&1 &
# Give Xvfb a moment to come up before a client connects.
sleep 2

# Run the Python script on display :99
DISPLAY=:99 python multi_crawler/poo_generator.py
31 changes: 28 additions & 3 deletions multi_crawler/ytb_session.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import logging
import random
import subprocess
from typing import Any

import yt_dlp
Expand Down Expand Up @@ -46,13 +47,38 @@ def __init__(self, params: dict = None, max_attemps: int = -1, **kwargs):

def _gen_proxy(self) -> str:
"""Generates a random proxy string using Tor."""
creds = str(random.randint(10000, 10**9)) + ":" + "foobar"
return f"socks5://{creds}@127.0.0.1:9050"
# creds = str(random.randint(10000, 10**9)) + ":" + "foobar"
return "http://127.0.0.1:3128" # return f"socks5://{creds}@127.0.0.1:9050"

def _generate_poo(self, max_retries: int = 3) -> str:
    """Generate a PO token by running the helper shell script.

    Runs ``./multi_crawler/scripts/poo_gen.sh`` and extracts the value that
    follows ``po_token: `` in its standard output. When the output carries
    the generator's ``[WARNING]`` line (the token is likely unusable) the
    script is run again, up to ``max_retries`` extra times. If the warning
    is still present on the last attempt, that token is returned anyway.

    Args:
        max_retries: Number of additional runs allowed after the first one
            when the output contains the warning. Negative values are
            treated as 0.

    Returns:
        The token text, stripped of surrounding whitespace.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero
            status, or if its output contains no ``po_token: `` line.
    """
    command = ["./multi_crawler/scripts/poo_gen.sh"]
    marker = "po_token: "
    total_attempts = max(0, max_retries) + 1

    for attempt in range(1, total_attempts + 1):
        logger.info("Generating poo token")
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            check=True,
        )
        output = completed.stdout.strip()

        _, found, tail = output.partition(marker)
        if not found:
            # No token line at all: report it as a failed run of the script
            # instead of letting an IndexError escape from the parsing.
            raise subprocess.CalledProcessError(
                completed.returncode,
                command,
                output=completed.stdout,
                stderr=completed.stderr,
            )
        poo_token = tail.split("\n")[0].strip()

        # The warning is printed in upper case ("[WARNING] ..."), so the
        # comparison has to be case-insensitive to ever match.
        if "[warning]" in output.lower():
            if attempt < total_attempts:
                logger.warning("Failed to generate poo token. Retrying...")
                continue
            logger.warning(
                "Poo token still flagged as unreliable after %d attempts; using it anyway",
                total_attempts,
            )

        logger.info("Generated poo token: %s", poo_token[:10] + "...")
        return poo_token

def _init_ytdl(self):
    """Initializes or reinitializes the YoutubeDL instance with a new proxy.

    Stores the proxy URL and, when one can be generated, a ``web+``-prefixed
    PO token in ``self.params`` and then builds ``self.ytdl`` from those
    params. Token generation is best-effort: if the helper script fails, the
    failure is logged and the instance is still created, leaving any
    ``po_token`` already present in ``self.params`` untouched.
    """
    # Set a new proxy for each initialization
    self.params["proxy"] = self._gen_proxy()

    # NOTE(review): yt-dlp documents PO tokens as a "youtube" extractor
    # argument; confirm that a top-level "po_token" entry in the params is
    # actually honored.
    try:
        self.params["po_token"] = f"web+{self._generate_poo()}"
    except subprocess.CalledProcessError as err:
        # Deliberately non-fatal, but no longer silent.
        logger.warning(
            "Could not generate poo token (exit code %s); continuing without a new one",
            err.returncode,
        )

    self.ytdl = yt_dlp.YoutubeDL(self.params, **self.kwargs)
    logger.info("Initialized YoutubeDL with proxy %s", self.params["proxy"])

Expand All @@ -67,7 +93,6 @@ def _handle_download_error(self, method_name: str, *args, **kwargs) -> Any:
"""

attempt = 0

while attempt < self._max_attempts or self._max_attempts == -1:
try:
method = getattr(self.ytdl, method_name)
Expand Down

0 comments on commit 595e30c

Please sign in to comment.