[WIP] Extractor database #35

Open · wants to merge 17 commits into master
47 changes: 30 additions & 17 deletions darkspider.py
@@ -35,30 +35,32 @@
import logging
import os
import sys
import time
import warnings

import requests
from dotenv import load_dotenv

# DarkSpider Modules
from modules import Crawler
from modules.checker import check_ip, check_tor, extract_domain, folder, url_canon
from modules.extractor import Extractor
from modules.helper import HEADER, Colors, get_tor_proxies, gradient_print, setup_custom_logger
from modules.helper import HEADER, Colors, DatabaseManager, get_tor_proxies, gradient_print, setup_custom_logger
from modules.visualization import Visualization

warnings.filterwarnings("ignore", category=UserWarning, module=r"bs4|gooey")
logging.getLogger("urllib3").setLevel(logging.ERROR)
requests.urllib3.disable_warnings()


def main(gooey_available, baseParser):
def main(gooey_available: bool, base_parser: argparse.ArgumentParser):
"""Main method of DarkSpider application. Collects and parses arguments and
instructs the rest of the application on how to run.
"""

# Get arguments with GooeyParser if available else argparse.
description = "DarkSpider is a multithreaded crawler and extractor for regular or onion webpages through the TOR network, written in Python."
parser = baseParser(description=description, add_help=False)
parser: argparse.ArgumentParser = base_parser(description=description, add_help=False)

# Required
required_group = parser.add_argument_group("Required Options", "Either argument -u/--url or -i/--input is required")
@@ -223,7 +225,6 @@ def main(gooey_available, baseParser):

args = parser.parse_args()

print(args.pause)
if args.url is None and args.input is None:
parser.error("either argument -u/--url or -i/--input is required to proceed.")

@@ -249,7 +250,7 @@ def main(gooey_available, baseParser):
# Canonicalization of web url and create path for output.
if args.url:
canon, website = url_canon(args.url)
out_path = extract_domain(website)
out_path = f"{extract_domain(website)}.{int(time.time())}"
elif args.folder:
out_path = args.folder

@@ -277,6 +278,14 @@ def main(gooey_available, baseParser):
if out_path:
crawlog.debug("Folder created :: %s", out_path)

try:
load_dotenv()
db = DatabaseManager(
out_path, os.environ.get("NEO4J_SERVER"), os.environ.get("NEO4J_USER"), os.environ.get("NEO4J_PASSWORD")
)
except Exception as e:
crawlog.error("Error :: Failed to create graph client", exc_info=e)
return
if args.Crawl and website:
crawler = Crawler(
website=website,
@@ -287,17 +296,15 @@ def main(gooey_available, baseParser):
external=getattr(args, "External links"),
exclusion=args.exclusion,
thread=args.thread,
db=db,
logger=crawlog,
)
json_data = crawler.crawl()
crawlog.info(
"Network Structure created :: %s",
os.path.join(out_path, crawler.network_file),
)
crawlog.info("Crawling completed successfully")

if args.Visualize:
obj = Visualization(
json_file=os.path.join(out_path, crawler.network_file),
json_data=json_data,
out_path=out_path,
logger=crawlog,
)
@@ -311,6 +318,7 @@ def main(gooey_available, baseParser):

if args.Extract:
input_file = os.path.join(out_path, "links.txt")
        # Input file is present and Crawling is done :: Cinex
extractor = Extractor(
website=website,
proxies=proxies,
@@ -319,11 +327,15 @@ def main(gooey_available, baseParser):
input_file=input_file,
out_path=out_path,
thread=args.thread,
db=db,
yara=args.yara,
logger=crawlog,
)
extract = extractor.extract()
dataset_path = extractor.extract()
elif args.input or website:
        # Input file is present but Crawling is not done (O/P to terminal) :: Terminex
        # No input file, so extract the website to an output file :: Outex
        # If even the output file is not there, then O/P to terminal :: Termex
extractor = Extractor(
website=website,
proxies=proxies,
@@ -332,16 +344,17 @@ def main(gooey_available, baseParser):
input_file=args.input or "",
out_path=out_path,
thread=args.thread,
db=db,
yara=args.yara,
logger=crawlog,
)
extract = extractor.extract()
dataset_path = extractor.extract()


GOOEY_AVAILABLE = False
PARSER = argparse.ArgumentParser

if not sys.stdout.isatty() or "-g" in sys.argv or "--gui" in sys.argv:
if "-g" in sys.argv or "--gui" in sys.argv:
    # If the CLI includes -g/--gui, use Gooey
try:
from gooey import Gooey, GooeyParser
@@ -354,7 +367,7 @@ def main(gooey_available, baseParser):
program_name="DarkSpider",
image_dir="assets",
monospace_display=True,
tabbed_groups=False,
tabbed_groups=True,
menu=[
{
"name": "File",
@@ -367,7 +380,7 @@ def main(gooey_available, baseParser):
"version": "2.1.0",
"copyright": "2023",
"website": "https://proxzima.dev/DarkSpider/",
"developer": "https://github.com/PROxZIMA, https://github.com/knightster0804, https://github.com/r0nl, https://github.com/ytatiya3",
"developer": "https://github.com/PROxZIMA \nhttps://github.com/knightster0804 \nhttps://github.com/r0nl \nhttps://github.com/ytatiya3",
"license": "GNU General Public License v3.0",
},
{
@@ -390,7 +403,7 @@ def main(gooey_available, baseParser):
f"[ {Colors.BLUE}INFO {Colors.RESET} ] Install Gooey with 'pip install Gooey' or remove '-g/--gui' argument"
)
sys.exit(2)
else:
elif "-v" in sys.argv or "--verbose" in sys.argv:
os.system("cls" if os.name == "nt" else "clear")

gradient_print(
@@ -402,4 +415,4 @@ def main(gooey_available, baseParser):

# Stub to call main method.
if __name__ == "__main__":
main(gooey_available=GOOEY_AVAILABLE, baseParser=PARSER)
main(gooey_available=GOOEY_AVAILABLE, base_parser=PARSER)
8 changes: 4 additions & 4 deletions docs/contribute.md
@@ -33,15 +33,15 @@ $ bundle exec jekyll serve -c _config_dev.yml --livereload --open-url
```bash
$ pip install -r requirements_dev.txt
```
- Before committing, make sure to run all the test cases.
- Module-specific test case using

```bash
$ coverage run -m pytest -q --tb=short modules/tests/
$ pytest -q --tb=short modules/tests/test_extractor.py::TestCheckerFunctions::test_outex_002
```
- Or a module-specific test case using
- Before committing, make sure to run all the test cases.

```bash
$ pytest -q --tb=short modules/tests/test_extractor.py::TestCheckerFunctions::test_outex_002
$ coverage run -m pytest -q --tb=short modules/tests/
```

20 changes: 20 additions & 0 deletions docs/getting-started.md
@@ -15,6 +15,26 @@ $ git clone https://github.com/PROxZIMA/DarkSpider.git
### Dependencies
You'll also need to install dependencies:

- [`Neo4j`](https://neo4j.com/) :: For desktop app, see the [official installation](https://neo4j.com/download-center/#desktop) docs
- Open Neo4j desktop application.
- New > Create project > Add > Local DBMS > Enter name `Graph DBMS` and password `<<password>>` > Create > Start.
- Create an [`APOC` config file](https://neo4j.com/docs/apoc/current/config/) with the following content.

```ruby
apoc.export.file.enabled=true
apoc.import.file.use_neo4j_config=false
```
- Select project > Click `Graph DBMS` > Plugins pane > `APOC` > Install and Restart.
- Wait for the database to start, then open the Neo4j Browser.
- Run `:server status` and note down `<<user>>` and `<<server_uri>>`.
- Create a new `.env` file in the root of the project directory with the following content; a quick connectivity check is sketched after the file contents below.

```ruby
NEO4J_SERVER=server_uri
NEO4J_USER=user
NEO4J_PASSWORD=password
```
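- Optionally, verify the credentials before running the crawler. The snippet below is only a minimal sketch (not part of DarkSpider); it assumes the official `neo4j` Python driver and `python-dotenv` are installed and that the `.env` file above is in the current working directory.

```python
import os

from dotenv import load_dotenv
from neo4j import GraphDatabase

load_dotenv()  # reads NEO4J_SERVER, NEO4J_USER and NEO4J_PASSWORD from .env

# Open a driver against the local DBMS and fail fast if the server is
# unreachable or the credentials are wrong.
driver = GraphDatabase.driver(
    os.environ["NEO4J_SERVER"],
    auth=(os.environ["NEO4J_USER"], os.environ["NEO4J_PASSWORD"]),
)
driver.verify_connectivity()
driver.close()
print("Neo4j connection OK")
```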

- [`wxPython`](https://wxpython.org/) :: For Linux, see the [official installation](https://wxpython.org/pages/downloads/index.html) docs

```shell
2 changes: 1 addition & 1 deletion docs/index.md
@@ -10,7 +10,7 @@ permalink: /
DarkSpider is a multithreaded crawler and extractor for regular or onion webpages through the TOR network, written in Python.
{: .fs-6 .fw-300 }

[Get started now](Getting-Started){: .btn .btn-primary .fs-5 .mb-4 .mb-md-0 .mr-2 } [View it on GitHub](https://github.com/PROxZIMA/DarkSpider/){: .btn .fs-5 .mb-4 .mb-md-0 }
[Get started now](getting-started){: .btn .btn-primary .fs-5 .mb-4 .mb-md-0 .mr-2 } [View it on GitHub](https://github.com/PROxZIMA/DarkSpider/){: .btn .fs-5 .mb-4 .mb-md-0 }

{: .warning }
> Crawling is not illegal, but violating copyright is. It’s always best to double check a website’s T&C before crawling them. Some websites set up what’s called `robots.txt` to tell crawlers not to visit those pages. This crawler will allow you to go around this, but we always recommend respecting `robots.txt`.