Merge pull request #18 from PROxZIMA/multithread-implementation
Crawler multi-threaded implementation
PROxZIMA authored Sep 23, 2022
2 parents 3750d15 + bed95bc commit bd3b96e
Showing 18 changed files with 1,572 additions and 1,198 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -166,3 +166,5 @@ www*/
assets/Dark*
assets/*.odt
assets/*2.pdf

.vscode
317 changes: 197 additions & 120 deletions README.md

Large diffs are not rendered by default.

Binary file added assets/logging.png
242 changes: 143 additions & 99 deletions torcrawl.py → darkspider.py
@@ -1,55 +1,61 @@
#!/usr/bin/python
#!/usr/bin/env python3

"""
DarkSpider is a Python script to crawl and extract (regular or onion)
webpages through the TOR network.
usage: python torcrawl.py [options]
python torcrawl.py -u l0r3m1p5umD0lorS1t4m3t.onion
python torcrawl.py -v -w -u http://www.github.com -o github.htm
python torcrawl.py -v -u l0r3m1p5umD0lorS1t4m3t.onion -c -d 2 -p 5
python torcrawl.py -v -w -u http://www.github.com -c -d 2 -p 5 -e -f GitHub
usage: python darkspider.py [options]
python darkspider.py -u l0r3m1p5umD0lorS1t4m3t.onion
python darkspider.py -v -w -u http://www.github.com -o github.htm
python darkspider.py -v -u l0r3m1p5umD0lorS1t4m3t.onion -c -d 2 -p 5
python darkspider.py -v -w -u http://www.github.com -c -d 2 -p 5 -e -f GitHub
General:
-h, --help : Help
-g, --gui : Open with GUI backend.
-v, --verbose : Show more information about the progress
-u, --url *.onion : URL of Webpage to crawl or extract
-n, --port number : Port number of TOR Proxy (default: 9050)
-w, --without : Without the use of Relay TOR
-s, --visualize : Visualize the graphs and insights from the crawled data
Extract:
-e, --extract : Extract page's code to terminal or file.
(Default: terminal)
-i, --input filename : Input file with URL(s) (separated by line)
-o, --output [filename] : Output page(s) to file(s) (for one page)
-y, --yara : Yara keyword search page categorisation
-e, --extract : Extract page's code to terminal or file.
(Default: terminal)
-i, --input filename : Input file with URL(s) (separated by line)
-o, --output filename : Output page(s) to file(s) (for one page)
-y, --yara 0|1 : Yara keyword search page categorisation
read in from /res folder. 0 search whole html object.
1 search only the text.
Crawl:
-c, --crawl : Crawl website (Default output on /links.txt)
-d, --cdepth : Set depth of crawl's travel (Default: 1)
-z, --exclusions : Paths that you don't want to include
-m, --simultaneous: How many pages to visit at the same time (TODO)
-p, --pause : The length of time the crawler will pause
(Default: 0)
-f, --folder : The root directory which will contain the
generated files
-l, --log : Log file with visited URLs and their response code.
-x, --external : Exclude external links while crawling a webpage
(Default: include all links)
GitHub: github.com/MikeMeliz/TorCrawl.py
-c, --crawl : Crawl website (Default output on /links.txt)
-d, --cdepth : Set depth of crawl's travel (Default: 1)
-z, --exclusions regexp : Paths that you don't want to include
-t, --thread number : How many pages to visit (Threads) at the same time
(Default: 16)
-p, --pause : The length of time the crawler will pause
(Default: 0)
-f, --folder : The root directory which will contain the
generated files
-l, --log : Log file with visited URLs and their response code.
-x, --external : Exclude external links while crawling a webpage
(Default: include all links)
GitHub: github.com/PROxZIMA/DarkSpider.py
License: GNU General Public License v3.0
"""

import argparse
import logging
import os
import socket
import sys
import warnings

import requests

import socks # noqa - pysocks
from modules.helper import get_tor_proxies, setup_custom_logger

try:
from gooey import Gooey, GooeyParser
@@ -58,13 +64,16 @@
except ModuleNotFoundError:
GOOEY_AVAILABLE = False

from modules.checker import check_ip, check_tor, extract_domain, folder, url_canon

# DarkSpider Modules
from modules.crawler import Crawler
from modules.extractor import extractor
from modules import Crawler
from modules.checker import check_ip, check_tor, extract_domain, folder, url_canon
from modules.extractor import Extractor
from modules.visualization import Visualization

warnings.filterwarnings("ignore", category=UserWarning, module="bs4")
logging.getLogger("urllib3").setLevel(logging.ERROR)
requests.urllib3.disable_warnings()

IGNORE_COMMAND = "--ignore-gooey"

# Remove IGNORE_COMMAND if present in arguments.
@@ -83,29 +92,6 @@
print("## Install Gooey with 'pip install Gooey' or remove '-g/--gui' argument")
sys.exit(2)

# Set socket and connection with TOR network
def connect_tor():
"""Connect to TOR via DNS resolution through a socket.
:return: None or HTTPError.
"""
try:
port = 9050
# Set socks proxy and wrap the urllib module
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", port)
socket.socket = socks.socksocket

# Perform DNS resolution through the socket
def getaddrinfo(*args): # noqa
return [(socket.AF_INET, socket.SOCK_STREAM, 6, "", (args[0], args[1]))]

socket.getaddrinfo = getaddrinfo # noqa
except socks.HTTPError as err:
error = sys.exc_info()[0]
print(
f"Error: {error} \n## Cannot establish connection with TOR\n"
f"HTTPError: {err}"
)
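
The removed connect_tor() above monkey-patched the socket module globally; the new flow instead passes a per-request proxy mapping obtained from get_tor_proxies(). A sketch of what such a helper plausibly returns, assuming the standard requests/PySocks convention (the real helper in modules.helper may differ):

    def tor_proxies_sketch(port=9050):
        # socks5h (note the "h") makes DNS resolution happen inside the Tor
        # proxy, which is what the old getaddrinfo override achieved globally.
        return {
            "http": f"socks5h://127.0.0.1:{port}",
            "https": f"socks5h://127.0.0.1:{port}",
        }

    # Usage (needs requests[socks]/pysocks installed):
    # requests.get("http://example.onion", proxies=tor_proxies_sketch(9050))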


def GooeyConditional(flag, **kwargs):
"""Conditional decorator if GUI backend is available or not"""
@@ -120,14 +106,11 @@ def decorate(function):
def main():
"""Main method of DarkSpider application. Collects and parses arguments and
instructs the rest of the application on how to run.
:return: None
"""

# Get arguments with GooeyParser if available else argparse.
description = (
"DarkSpider.py is a python script to crawl and extract "
+ "(regular or onion) webpages through TOR network."
"DarkSpider.py is a python script to crawl and extract " + "(regular or onion) webpages through TOR network."
)
if GOOEY_AVAILABLE:
parser = GooeyParser(description=description)
@@ -161,12 +144,15 @@ def main():
action="store_true",
help="Visualize the graphs and insights from the crawled data",
)
parser.add_argument("-u", "--url", type=str, help="URL of webpage to crawl or extract")
parser.add_argument(
"-u", "--url", type=str, help="URL of webpage to crawl or extract"
)
parser.add_argument(
"-w", "--without", action="store_true", help="Without the use of Relay TOR"
"-n",
"--port",
type=int,
default=9050,
help="Port number of TOR Proxy (default: 9050)",
)
parser.add_argument("-w", "--without", action="store_true", help="Without the use of Relay TOR")

# Extract
parser.add_argument(
@@ -206,8 +192,8 @@ def main():
parser.add_argument(
"-p",
"--cpause",
type=int,
default=1,
type=float,
default=0,
help="The length of time the crawler will pause. (Default: 1 second)",
)
parser.add_argument(
@@ -216,12 +202,19 @@
type=str,
help="Regex path that is ignored while crawling",
)
parser.add_argument(
"-t",
"--thread",
type=int,
default=16,
help="How many pages to visit (Threads) at the same time (Default: 16)",
)
parser.add_argument(
"-l",
"--log",
action="store_true",
help="A save log will let you see which URLs were visited and their "
"response code",
action="store_false",
default=True,
help="A save log will let you see which URLs were visited and their " "response code",
)
parser.add_argument(
"-f",
@@ -240,9 +233,9 @@
"-y",
"--yara",
type=int,
default=0,
default=None,
help="Check for keywords and only scrape documents that contain a "
"match. 0 search whole html object. 1 search only the text. (Default: 0)",
"match. 0 search whole html object. 1 search only the text. (Default: None)",
)

if len(sys.argv) == 1:
@@ -254,62 +247,113 @@
if args.url is None and args.input is None:
parser.error("either argument -u/--url or -i/--input is required to proceed.")

if args.yara not in [0, 1]:
if args.port < 1 or 65535 < args.port:
parser.error("argument -n/--port: expected argument in between 1 to 65535.")

if args.yara and args.yara not in [0, 1]:
parser.error("argument -y/--yara: expected argument 0 or 1.")

# Connect to TOR
if args.without is False:
check_tor(args.verbose)
connect_tor()
if args.cdepth < 1:
parser.error("argument -d/--cdepth: expected argument greater than 1.")

if args.verbose:
check_ip()
print(("## URL: " + args.url))
if args.cpause < 0:
parser.error("argument -p/--cpause: expected argument greater than 0.")

if args.thread < 1:
parser.error("argument -t/--thread: expected argument greater than 1.")

website = ""
proxies = None
out_path = ""
canon, website = False, ""

# Canonicalization of web url and create path for output.
if args.url:
website = url_canon(args.url, args.verbose)
if args.folder is not None:
out_path = folder(args.folder, args.verbose)
else:
out_path = folder(extract_domain(website), args.verbose)
canon, website = url_canon(args.url)
out_path = folder(extract_domain(website))
elif args.folder:
out_path = folder(args.folder)

# Logger setup
crawlog = setup_custom_logger(
name="crawlog",
filename=os.path.join(out_path, "crawl.log"),
verbose_=args.verbose,
filelog=args.log,
argv=sys.argv,
)
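
setup_custom_logger bundles console and file logging behind one call (it also receives argv, presumably so the command line ends up in the log). A rough sketch of what such a helper usually wires up, not the module's exact implementation:

    import logging
    import os
    import sys

    def setup_logger_sketch(name, filename, verbose_=False, filelog=True):
        logger = logging.getLogger(name)
        logger.setLevel(logging.DEBUG)
        console = logging.StreamHandler(sys.stdout)
        console.setLevel(logging.DEBUG if verbose_ else logging.INFO)
        logger.addHandler(console)
        if filelog:
            # crawl.log lives inside the output folder created above.
            os.makedirs(os.path.dirname(filename) or ".", exist_ok=True)
            logger.addHandler(logging.FileHandler(filename))
        return logger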

# Connect to TOR
if not args.without:
check_tor(logger=crawlog)
proxies = get_tor_proxies(port=args.port)

if args.verbose:
check_ip(proxies=proxies, url=args.url, logger=crawlog, without_tor=args.without)

if args.crawl:
if canon:
crawlog.debug("URL fixed :: %s", website)
if out_path:
crawlog.debug("Folder created :: %s", out_path)

if args.crawl and website:
crawler = Crawler(
website,
args.cdepth,
args.cpause,
out_path,
args.external,
args.log,
args.verbose,
args.exclusion,
website=website,
proxies=proxies,
c_depth=args.cdepth,
c_pause=args.cpause,
out_path=out_path,
external=args.external,
exclusion=args.exclusion,
thread=args.thread,
logger=crawlog,
)
json_data = crawler.crawl()
print(f"## File created on {os.getcwd()}/{out_path}/network_structure.json")
crawlog.info(
"Network Structure created :: %s",
os.path.join(out_path, crawler.network_file),
)

if args.visualize:
obj = Visualization(
out_path + "/network_structure.json", out_path, args.verbose
json_file=os.path.join(out_path, crawler.network_file),
out_path=out_path,
logger=crawlog,
)
obj.indegree_plot()
obj.indegree_bar()
obj.outdegree_plot()
obj.outdegree_bar()
obj.eigenvector_centrality_bar()
obj.pagerank_bar()
obj.visualize()
# obj.visualize()
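
The Visualization methods above read the crawler's network_structure.json. Assuming that file maps each crawled URL to its outgoing links (an assumption about the format, not stated in this diff), the plotted metrics boil down to standard networkx calls:

    import json

    import networkx as nx

    def graph_metrics_sketch(json_file):
        with open(json_file, encoding="utf-8") as fp:
            adjacency = json.load(fp)  # assumed shape: {url: [linked_url, ...]}
        graph = nx.from_dict_of_lists(adjacency, create_using=nx.DiGraph)
        return {
            "indegree": dict(graph.in_degree()),
            "outdegree": dict(graph.out_degree()),
            "eigenvector": nx.eigenvector_centrality(graph, max_iter=500),
            "pagerank": nx.pagerank(graph),
        }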

if args.extract:
input_file = out_path + "/links.txt"
extractor(website, args.crawl, args.output, input_file, out_path, args.yara)
else:
extractor(
website, args.crawl, args.output, args.input or "", out_path, args.yara
input_file = os.path.join(out_path, "links.txt")
extractor = Extractor(
website=website,
proxies=proxies,
crawl=args.crawl,
output_file=args.output,
input_file=input_file,
out_path=out_path,
thread=args.thread,
yara=args.yara,
logger=crawlog,
)
extract = extractor.extract()
elif args.input or website:
extractor = Extractor(
website=website,
proxies=proxies,
crawl=args.crawl,
output_file=args.output,
input_file=args.input or "",
out_path=out_path,
thread=args.thread,
yara=args.yara,
logger=crawlog,
)
extract = extractor.extract()


# Stub to call main method.
3 changes: 3 additions & 0 deletions modules/__init__.py
@@ -0,0 +1,3 @@
from modules.crawler import Crawler
from modules.extractor import Extractor
from modules.visualization import Visualization
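
These re-exports are what let darkspider.py pull the classes straight from the package root:

    # Equivalent to the entry script's new import style:
    from modules import Crawler, Extractor, Visualization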