From bee9170c465edc92a4ad88b29d58d6afc6b4d1fe Mon Sep 17 00:00:00 2001
From: PROxZIMA
Date: Wed, 14 Sep 2022 00:40:38 +0530
Subject: [PATCH 01/39] Add: Initial support for multithreading

---
 modules/crawler.py | 269 ++++++++++++++++++++++++++++-----------------
 torcrawl.py        |  80 +++++++-------
 2 files changed, 204 insertions(+), 145 deletions(-)

diff --git a/modules/crawler.py b/modules/crawler.py
index 333c98d..339bdc7 100644
--- a/modules/crawler.py
+++ b/modules/crawler.py
@@ -1,40 +1,87 @@
 #!/usr/bin/python
-import http.client
 import json
 import os
 import re
 import sys
 import time
-import urllib.request
+import warnings
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from io import TextIOBase
+from threading import Lock
 from urllib.parse import urljoin
 
+import requests
 from bs4 import BeautifulSoup
+from requests.models import Response
+
+warnings.filterwarnings("ignore", category=UserWarning, module="bs4")
+requests.urllib3.disable_warnings()
 
 
 class Crawler:
     """Crawl input link upto depth (c_depth) with a pause of c_pause seconds.
 
     :param website: String: Website to crawl.
+    :param proxies: Dictionary: Dictionary mapping protocol or protocol and host to the URL of the proxy.
     :param c_depth: Integer: Depth of the crawl.
-    :param c_pause: Integer: Pause after every iteration.
+    :param c_pause: Float: Pause after every iteration.
     :param out_path: String: Output path to store extracted links.
     :param external: Boolean: True if external links are to be crawled else False.
+    :param thread: Integer: Number pages to visit (Threads) at the same time.
     :param logs: Boolean: True if logs are to be written else False.
     :param verbose: Boolean: True if crawl details are to be printed else False.
     :param exclusion: re String: Paths that you don't want to include.
     """
 
+    network_file = "network_structure.json"
+    __header = {
+        "Accept-Encoding": "identity",
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36",
+    }
+    __port = 9050
+    __proxies = {
+        "http": f"socks5h://127.0.0.1:{__port}",
+        "https": f"socks5h://127.0.0.1:{__port}",
+    }
+
     def __init__(
-        self, website, c_depth, c_pause, out_path, external, logs, verbose, exclusion
+        self,
+        website,
+        c_depth,
+        c_pause,
+        out_path,
+        external,
+        thread,
+        logs,
+        verbose,
+        exclusion,
     ):
         self.website = website
         self.c_depth = c_depth
         self.c_pause = c_pause
         self.out_path = out_path
-        self.external = rf"{external}"
+        self.external = external
+        self.thread = thread
         self.logs = logs
+        self.log_path = os.path.join(self.out_path, "log.txt")
         self.verbose = verbose
-        self.exclusion = exclusion
+        self.exclusion = rf"{exclusion}" if exclusion else None
+        self.__executor = ThreadPoolExecutor(max_workers=min(32, self.thread))
+        self.__lock = Lock()
+        self.__files = {
+            "extlinks": open(
+                os.path.join(self.out_path, "extlinks.txt"), "w+", encoding="UTF-8"
+            ),
+            "telephones": open(
+                os.path.join(self.out_path, "telephones.txt"), "w+", encoding="UTF-8"
+            ),
+            "mails": open(
+                os.path.join(self.out_path, "mails.txt"), "w+", encoding="UTF-8"
+            ),
+            "log_file": open(self.log_path, "w+", encoding="UTF-8"),
+            "network_structure": os.path.join(self.out_path, self.network_file),
+            "links": os.path.join(self.out_path, "links.txt"),
+        }
 
     def excludes(self, link):
         """Excludes links that are not required.
@@ -55,21 +102,15 @@ def excludes(self, link):
         if link.startswith("http") and not link.startswith(self.website):
             if self.external is True:
                 return False
-            file_path = self.out_path + "/extlinks.txt"
-            with open(file_path, "a+", encoding="UTF-8") as lst_file:
-                lst_file.write(str(link) + "\n")
+            self.__files["extlinks"].write(str(link) + "\n")
             return True
         # Telephone Number
         if link.startswith("tel:"):
-            file_path = self.out_path + "/telephones.txt"
-            with open(file_path, "a+", encoding="UTF-8") as lst_file:
-                lst_file.write(str(link) + "\n")
+            self.__files["telephones"].write(str(link) + "\n")
             return True
         # Mails
         if link.startswith("mailto:"):
-            file_path = self.out_path + "/mails.txt"
-            with open(file_path, "a+", encoding="UTF-8") as lst_file:
-                lst_file.write(str(link) + "\n")
+            self.__files["mails"].write(str(link) + "\n")
             return True
         # Type of files
         if re.search("^.*\\.(pdf|jpg|jpeg|png|gif|doc)$", link, re.IGNORECASE):
@@ -88,6 +129,85 @@ def canonical(self, base, href):
         # For relative paths
         return urljoin(base, href)
 
+    def __get_tor_session(self):
+        session = requests.Session()
+        session.proxies = self.__proxies
+        session.headers.update(self.__header)
+        session.verify = False
+        return session
+
+    def __crawl_link(self, item, session):
+        # Store the crawled link of an item
+        item_data = set()
+        html_page = Response
+
+        try:
+            if item is not None:
+                # html_page = urllib.request.urlopen(item, timeout=10)
+                html_page = session.get(item, allow_redirects=True, timeout=10).text
+        except Exception as error:
+            if self.logs:
+                print(error)
+            return item, item_data
+
+        # Keeps logs for every webpage visited.
+        if self.logs:
+            with self.__lock:
+                self.__files["log_file"].write(f"{str(item)}\n")
+
+        try:
+            soup = BeautifulSoup(html_page, features="html.parser")
+        except Exception as _:
+            if self.logs:
+                print(f"## Soup Error Encountered:: to parse :: {item}")
+            return item, item_data
+
+        # For each <a> tag.
+        for link in soup.findAll("a"):
+            link = link.get("href")
+
+            if self.excludes(link):
+                continue
+
+            ver_link = self.canonical(item, link)
+            if ver_link is not None:
+                item_data.add(ver_link)
+
+        # For each <area> tag.
+        for link in soup.findAll("area"):
+            link = link.get("href")
+
+            if self.excludes(link):
+                continue
+
+            ver_link = self.canonical(item, link)
+            if ver_link is not None:
+                item_data.add(ver_link)
+
+        # For each