Merge pull request #18 from PROxZIMA/multithread-implementation
Crawler multi-threaded implementation
PROxZIMA authored Sep 23, 2022
2 parents 3750d15 + bed95bc commit bd3b96e
Showing 18 changed files with 1,572 additions and 1,198 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -166,3 +166,5 @@ www*/
assets/Dark*
assets/*.odt
assets/*2.pdf

.vscode
317 changes: 197 additions & 120 deletions README.md

Large diffs are not rendered by default.

Binary file added assets/logging.png
242 changes: 143 additions & 99 deletions torcrawl.py → darkspider.py
@@ -1,55 +1,61 @@
#!/usr/bin/python
#!/usr/bin/env python3

"""
DarkSpider is a Python script to crawl and extract (regular or onion)
webpages through the TOR network.
usage: python torcrawl.py [options]
python torcrawl.py -u l0r3m1p5umD0lorS1t4m3t.onion
python torcrawl.py -v -w -u http://www.github.com -o github.htm
python torcrawl.py -v -u l0r3m1p5umD0lorS1t4m3t.onion -c -d 2 -p 5
python torcrawl.py -v -w -u http://www.github.com -c -d 2 -p 5 -e -f GitHub
usage: python darkspider.py [options]
python darkspider.py -u l0r3m1p5umD0lorS1t4m3t.onion
python darkspider.py -v -w -u http://www.github.com -o github.htm
python darkspider.py -v -u l0r3m1p5umD0lorS1t4m3t.onion -c -d 2 -p 5
python darkspider.py -v -w -u http://www.github.com -c -d 2 -p 5 -e -f GitHub
General:
-h, --help : Help
-g, --gui : Open with GUI backend.
-v, --verbose : Show more information about the progress
-u, --url *.onion : URL of Webpage to crawl or extract
-n, --port number : Port number of TOR Proxy (default: 9050)
-w, --without : Without the use of Relay TOR
-s, --visualize : Visualize the graphs and insights from the crawled data
Extract:
-e, --extract : Extract page's code to terminal or file.
(Default: terminal)
-i, --input filename : Input file with URL(s) (separated by line)
-o, --output [filename] : Output page(s) to file(s) (for one page)
-y, --yara : Yara keyword search page categorisation
-e, --extract : Extract page's code to terminal or file.
(Default: terminal)
-i, --input filename : Input file with URL(s) (separated by line)
-o, --output filename : Output page(s) to file(s) (for one page)
-y, --yara 0|1 : Yara keyword search page categorisation
read in from /res folder. 0 search whole html object.
1 search only the text.
Crawl:
-c, --crawl : Crawl website (Default output on /links.txt)
-d, --cdepth : Set depth of crawl's travel (Default: 1)
-z, --exclusions : Paths that you don't want to include
-m, --simultaneous: How many pages to visit at the same time (TODO)
-p, --pause : The length of time the crawler will pause
(Default: 0)
-f, --folder : The root directory which will contain the
generated files
-l, --log : Log file with visited URLs and their response code.
-x, --external : Exclude external links while crawling a webpage
(Default: include all links)
GitHub: github.com/MikeMeliz/TorCrawl.py
-c, --crawl : Crawl website (Default output on /links.txt)
-d, --cdepth : Set depth of crawl's travel (Default: 1)
-z, --exclusions regexp : Paths that you don't want to include
-t, --thread number : How many pages to visit (Threads) at the same time
(Default: 16)
-p, --pause : The length of time the crawler will pause
(Default: 0)
-f, --folder : The root directory which will contain the
generated files
-l, --log : Log file with visited URLs and their response code.
-x, --external : Exclude external links while crawling a webpage
(Default: include all links)
GitHub: github.com/PROxZIMA/DarkSpider.py
License: GNU General Public License v3.0
"""

import argparse
import logging
import os
import socket
import sys
import warnings

import requests

import socks # noqa - pysocks
from modules.helper import get_tor_proxies, setup_custom_logger

try:
from gooey import Gooey, GooeyParser
@@ -58,13 +64,16 @@
except ModuleNotFoundError:
GOOEY_AVAILABLE = False

from modules.checker import check_ip, check_tor, extract_domain, folder, url_canon

# DarkSpider Modules
from modules.crawler import Crawler
from modules.extractor import extractor
from modules import Crawler
from modules.checker import check_ip, check_tor, extract_domain, folder, url_canon
from modules.extractor import Extractor
from modules.visualization import Visualization

warnings.filterwarnings("ignore", category=UserWarning, module="bs4")
logging.getLogger("urllib3").setLevel(logging.ERROR)
requests.urllib3.disable_warnings()

IGNORE_COMMAND = "--ignore-gooey"

# Remove IGNORE_COMMAND if present in arguments.
@@ -83,29 +92,6 @@
print("## Install Gooey with 'pip install Gooey' or remove '-g/--gui' argument")
sys.exit(2)

# Set socket and connection with TOR network
def connect_tor():
"""Connect to TOR via DNS resolution through a socket.
:return: None or HTTPError.
"""
try:
port = 9050
# Set socks proxy and wrap the urllib module
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", port)
socket.socket = socks.socksocket

# Perform DNS resolution through the socket
def getaddrinfo(*args): # noqa
return [(socket.AF_INET, socket.SOCK_STREAM, 6, "", (args[0], args[1]))]

socket.getaddrinfo = getaddrinfo # noqa
except socks.HTTPError as err:
error = sys.exc_info()[0]
print(
f"Error: {error} \n## Cannot establish connection with TOR\n"
f"HTTPError: {err}"
)
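
The removed connect_tor() above monkey-patched the socket module globally; the new flow instead passes a per-request proxy mapping obtained from get_tor_proxies(). A sketch of what such a helper plausibly returns, assuming the standard requests/PySocks convention (the real helper in modules.helper may differ):

    def tor_proxies_sketch(port=9050):
        # socks5h (note the "h") makes DNS resolution happen inside the Tor
        # proxy, which is what the old getaddrinfo override achieved globally.
        return {
            "http": f"socks5h://127.0.0.1:{port}",
            "https": f"socks5h://127.0.0.1:{port}",
        }

    # Usage (needs requests[socks]/pysocks installed):
    # requests.get("http://example.onion", proxies=tor_proxies_sketch(9050))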


def GooeyConditional(flag, **kwargs):
"""Conditional decorator if GUI backend is available or not"""
@@ -120,14 +106,11 @@ def decorate(function):
def main():
"""Main method of DarkSpider application. Collects and parses arguments and
instructs the rest of the application on how to run.
:return: None
"""

# Get arguments with GooeyParser if available else argparse.
description = (
"DarkSpider.py is a python script to crawl and extract "
+ "(regular or onion) webpages through TOR network."
"DarkSpider.py is a python script to crawl and extract " + "(regular or onion) webpages through TOR network."
)
if GOOEY_AVAILABLE:
parser = GooeyParser(description=description)
@@ -161,12 +144,15 @@ def main():
action="store_true",
help="Visualize the graphs and insights from the crawled data",
)
parser.add_argument("-u", "--url", type=str, help="URL of webpage to crawl or extract")
parser.add_argument(
"-u", "--url", type=str, help="URL of webpage to crawl or extract"
)
parser.add_argument(
"-w", "--without", action="store_true", help="Without the use of Relay TOR"
"-n",
"--port",
type=int,
default=9050,
help="Port number of TOR Proxy (default: 9050)",
)
parser.add_argument("-w", "--without", action="store_true", help="Without the use of Relay TOR")

# Extract
parser.add_argument(
@@ -206,8 +192,8 @@ def main():
parser.add_argument(
"-p",
"--cpause",
type=int,
default=1,
type=float,
default=0,
help="The length of time the crawler will pause. (Default: 1 second)",
)
parser.add_argument(
@@ -216,12 +202,19 @@
type=str,
help="Regex path that is ignored while crawling",
)
parser.add_argument(
"-t",
"--thread",
type=int,
default=16,
help="How many pages to visit (Threads) at the same time (Default: 16)",
)
parser.add_argument(
"-l",
"--log",
action="store_true",
help="A save log will let you see which URLs were visited and their "
"response code",
action="store_false",
default=True,
help="A save log will let you see which URLs were visited and their " "response code",
)
parser.add_argument(
"-f",
@@ -240,9 +233,9 @@
"-y",
"--yara",
type=int,
default=0,
default=None,
help="Check for keywords and only scrape documents that contain a "
"match. 0 search whole html object. 1 search only the text. (Default: 0)",
"match. 0 search whole html object. 1 search only the text. (Default: None)",
)

if len(sys.argv) == 1:
@@ -254,62 +247,113 @@
if args.url is None and args.input is None:
parser.error("either argument -u/--url or -i/--input is required to proceed.")

if args.yara not in [0, 1]:
if args.port < 1 or 65535 < args.port:
parser.error("argument -n/--port: expected argument in between 1 to 65535.")

if args.yara and args.yara not in [0, 1]:
parser.error("argument -y/--yara: expected argument 0 or 1.")

# Connect to TOR
if args.without is False:
check_tor(args.verbose)
connect_tor()
if args.cdepth < 1:
parser.error("argument -d/--cdepth: expected argument greater than 1.")

if args.verbose:
check_ip()
print(("## URL: " + args.url))
if args.cpause < 0:
parser.error("argument -p/--cpause: expected argument greater than 0.")

if args.thread < 1:
parser.error("argument -t/--thread: expected argument greater than 1.")

website = ""
proxies = None
out_path = ""
canon, website = False, ""

# Canonicalization of web url and create path for output.
if args.url:
website = url_canon(args.url, args.verbose)
if args.folder is not None:
out_path = folder(args.folder, args.verbose)
else:
out_path = folder(extract_domain(website), args.verbose)
canon, website = url_canon(args.url)
out_path = folder(extract_domain(website))
elif args.folder:
out_path = folder(args.folder)

# Logger setup
crawlog = setup_custom_logger(
name="crawlog",
filename=os.path.join(out_path, "crawl.log"),
verbose_=args.verbose,
filelog=args.log,
argv=sys.argv,
)
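
setup_custom_logger bundles console and file logging behind one call (it also receives argv, presumably so the command line ends up in the log). A rough sketch of what such a helper usually wires up, not the module's exact implementation:

    import logging
    import os
    import sys

    def setup_logger_sketch(name, filename, verbose_=False, filelog=True):
        logger = logging.getLogger(name)
        logger.setLevel(logging.DEBUG)
        console = logging.StreamHandler(sys.stdout)
        console.setLevel(logging.DEBUG if verbose_ else logging.INFO)
        logger.addHandler(console)
        if filelog:
            # crawl.log lives inside the output folder created above.
            os.makedirs(os.path.dirname(filename) or ".", exist_ok=True)
            logger.addHandler(logging.FileHandler(filename))
        return logger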

# Connect to TOR
if not args.without:
check_tor(logger=crawlog)
proxies = get_tor_proxies(port=args.port)

if args.verbose:
check_ip(proxies=proxies, url=args.url, logger=crawlog, without_tor=args.without)

if args.crawl:
if canon:
crawlog.debug("URL fixed :: %s", website)
if out_path:
crawlog.debug("Folder created :: %s", out_path)

if args.crawl and website:
crawler = Crawler(
website,
args.cdepth,
args.cpause,
out_path,
args.external,
args.log,
args.verbose,
args.exclusion,
website=website,
proxies=proxies,
c_depth=args.cdepth,
c_pause=args.cpause,
out_path=out_path,
external=args.external,
exclusion=args.exclusion,
thread=args.thread,
logger=crawlog,
)
json_data = crawler.crawl()
print(f"## File created on {os.getcwd()}/{out_path}/network_structure.json")
crawlog.info(
"Network Structure created :: %s",
os.path.join(out_path, crawler.network_file),
)

if args.visualize:
obj = Visualization(
out_path + "/network_structure.json", out_path, args.verbose
json_file=os.path.join(out_path, crawler.network_file),
out_path=out_path,
logger=crawlog,
)
obj.indegree_plot()
obj.indegree_bar()
obj.outdegree_plot()
obj.outdegree_bar()
obj.eigenvector_centrality_bar()
obj.pagerank_bar()
obj.visualize()
# obj.visualize()
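
The Visualization methods above read the crawler's network_structure.json. Assuming that file maps each crawled URL to its outgoing links (an assumption about the format, not stated in this diff), the plotted metrics boil down to standard networkx calls:

    import json

    import networkx as nx

    def graph_metrics_sketch(json_file):
        with open(json_file, encoding="utf-8") as fp:
            adjacency = json.load(fp)  # assumed shape: {url: [linked_url, ...]}
        graph = nx.from_dict_of_lists(adjacency, create_using=nx.DiGraph)
        return {
            "indegree": dict(graph.in_degree()),
            "outdegree": dict(graph.out_degree()),
            "eigenvector": nx.eigenvector_centrality(graph, max_iter=500),
            "pagerank": nx.pagerank(graph),
        }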

if args.extract:
input_file = out_path + "/links.txt"
extractor(website, args.crawl, args.output, input_file, out_path, args.yara)
else:
extractor(
website, args.crawl, args.output, args.input or "", out_path, args.yara
input_file = os.path.join(out_path, "links.txt")
extractor = Extractor(
website=website,
proxies=proxies,
crawl=args.crawl,
output_file=args.output,
input_file=input_file,
out_path=out_path,
thread=args.thread,
yara=args.yara,
logger=crawlog,
)
extract = extractor.extract()
elif args.input or website:
extractor = Extractor(
website=website,
proxies=proxies,
crawl=args.crawl,
output_file=args.output,
input_file=args.input or "",
out_path=out_path,
thread=args.thread,
yara=args.yara,
logger=crawlog,
)
extract = extractor.extract()


# Stub to call main method.
3 changes: 3 additions & 0 deletions modules/__init__.py
@@ -0,0 +1,3 @@
from modules.crawler import Crawler
from modules.extractor import Extractor
from modules.visualization import Visualization
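
These re-exports are what let darkspider.py pull the classes straight from the package root:

    # Equivalent to the entry script's new import style:
    from modules import Crawler, Extractor, Visualization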