diff --git a/darkspider.py b/darkspider.py index 4354aab..7206f73 100755 --- a/darkspider.py +++ b/darkspider.py @@ -35,15 +35,17 @@ import logging import os import sys +import time import warnings import requests +from dotenv import load_dotenv # DarkSpider Modules from modules import Crawler from modules.checker import check_ip, check_tor, extract_domain, folder, url_canon from modules.extractor import Extractor -from modules.helper import HEADER, Colors, get_tor_proxies, gradient_print, setup_custom_logger +from modules.helper import HEADER, Colors, DatabaseManager, get_tor_proxies, gradient_print, setup_custom_logger from modules.visualization import Visualization warnings.filterwarnings("ignore", category=UserWarning, module=r"bs4|gooey") @@ -51,14 +53,14 @@ requests.urllib3.disable_warnings() -def main(gooey_available, baseParser): +def main(gooey_available: bool, base_parser: argparse.ArgumentParser): """Main method of DarkSpider application. Collects and parses arguments and instructs the rest of the application on how to run. """ # Get arguments with GooeyParser if available else argparse. description = "DarkSpider is a multithreaded crawler and extractor for regular or onion webpages through the TOR network, written in Python." - parser = baseParser(description=description, add_help=False) + parser: argparse.ArgumentParser = base_parser(description=description, add_help=False) # Required required_group = parser.add_argument_group("Required Options", "Either argument -u/--url or -i/--input is required") @@ -223,7 +225,6 @@ def main(gooey_available, baseParser): args = parser.parse_args() - print(args.pause) if args.url is None and args.input is None: parser.error("either argument -u/--url or -i/--input is required to proceed.") @@ -249,7 +250,7 @@ def main(gooey_available, baseParser): # Canonicalization of web url and create path for output. 
if args.url: canon, website = url_canon(args.url) - out_path = extract_domain(website) + out_path = f"{extract_domain(website)}.{int(time.time())}" elif args.folder: out_path = args.folder @@ -277,6 +278,14 @@ def main(gooey_available, baseParser): if out_path: crawlog.debug("Folder created :: %s", out_path) + try: + load_dotenv() + db = DatabaseManager( + out_path, os.environ.get("NEO4J_SERVER"), os.environ.get("NEO4J_USER"), os.environ.get("NEO4J_PASSWORD") + ) + except Exception as e: + crawlog.error("Error :: Failed to create graph client", exc_info=e) + return if args.Crawl and website: crawler = Crawler( website=website, @@ -287,17 +296,15 @@ def main(gooey_available, baseParser): external=getattr(args, "External links"), exclusion=args.exclusion, thread=args.thread, + db=db, logger=crawlog, ) json_data = crawler.crawl() - crawlog.info( - "Network Structure created :: %s", - os.path.join(out_path, crawler.network_file), - ) + crawlog.info("Crawling completed successfully") if args.Visualize: obj = Visualization( - json_file=os.path.join(out_path, crawler.network_file), + json_data=json_data, out_path=out_path, logger=crawlog, ) @@ -311,6 +318,7 @@ def main(gooey_available, baseParser): if args.Extract: input_file = os.path.join(out_path, "links.txt") + # Input file is present and Craling is done :: Cinex extractor = Extractor( website=website, proxies=proxies, @@ -319,11 +327,15 @@ def main(gooey_available, baseParser): input_file=input_file, out_path=out_path, thread=args.thread, + db=db, yara=args.yara, logger=crawlog, ) - extract = extractor.extract() + dataset_path = extractor.extract() elif args.input or website: + # Input file is present but Crawling is not done (O/P to terminal) :: Terminex + # No input file so extract the website to output file :: Outex + # Even output file is not there then O/P to terminal :: Termex extractor = Extractor( website=website, proxies=proxies, @@ -332,16 +344,17 @@ def main(gooey_available, baseParser): input_file=args.input or "", out_path=out_path, thread=args.thread, + db=db, yara=args.yara, logger=crawlog, ) - extract = extractor.extract() + dataset_path = extractor.extract() GOOEY_AVAILABLE = False PARSER = argparse.ArgumentParser -if not sys.stdout.isatty() or "-g" in sys.argv or "--gui" in sys.argv: +if "-g" in sys.argv or "--gui" in sys.argv: # If we are not attached to a terminal or CLI includes -g/--gui, use Gooey try: from gooey import Gooey, GooeyParser @@ -354,7 +367,7 @@ def main(gooey_available, baseParser): program_name="DarkSpider", image_dir="assets", monospace_display=True, - tabbed_groups=False, + tabbed_groups=True, menu=[ { "name": "File", @@ -367,7 +380,7 @@ def main(gooey_available, baseParser): "version": "2.1.0", "copyright": "2023", "website": "https://proxzima.dev/DarkSpider/", - "developer": "https://github.com/PROxZIMA, https://github.com/knightster0804, https://github.com/r0nl, https://github.com/ytatiya3", + "developer": "https://github.com/PROxZIMA \nhttps://github.com/knightster0804 \nhttps://github.com/r0nl \nhttps://github.com/ytatiya3", "license": "GNU General Public License v3.0", }, { @@ -390,7 +403,7 @@ def main(gooey_available, baseParser): f"[ {Colors.BLUE}INFO {Colors.RESET} ] Install Gooey with 'pip install Gooey' or remove '-g/--gui' argument" ) sys.exit(2) -else: +elif "-v" in sys.argv or "--verbose" in sys.argv: os.system("cls" if os.name == "nt" else "clear") gradient_print( @@ -402,4 +415,4 @@ def main(gooey_available, baseParser): # Stub to call main method. 
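Note on the new output path: suffixing `extract_domain(website)` with `int(time.time())` means every invocation now writes into a fresh, run-scoped folder, so `db.log` and `dataset.csv` from different runs no longer collide. A minimal sketch of the expected naming, assuming `extract_domain` strips the scheme the way the checker tests show for `remove_http=True`:

```python
import time

from modules.checker import extract_domain

website = "http://darkspider.com"  # canonicalized URL from url_canon
out_path = f"{extract_domain(website)}.{int(time.time())}"
print(out_path)  # e.g. "darkspider.com.1700000000" -- a new folder per crawl run
```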
if __name__ == "__main__": - main(gooey_available=GOOEY_AVAILABLE, baseParser=PARSER) + main(gooey_available=GOOEY_AVAILABLE, base_parser=PARSER) diff --git a/docs/contribute.md b/docs/contribute.md index 554880a..a81bf9c 100644 --- a/docs/contribute.md +++ b/docs/contribute.md @@ -33,15 +33,15 @@ $ bundle exec jekyll serve -c _config_dev.yml --livereload --open-url ```bash $ pip install -r requirements_dev.txt ``` -- Before committing, make sure to run all the test cases. +- Run a module-specific test case using ```bash -$ coverage run -m pytest -q --tb=short modules/tests/ +$ pytest -q --tb=short modules/tests/test_extractor.py::TestCheckerFunctions::test_outex_002 ``` -- Or a module-specific test case using +- Before committing, make sure to run all the test cases. ```bash -$ pytest -q --tb=short modules/tests/test_extractor.py::TestCheckerFunctions::test_outex_002 +$ coverage run -m pytest -q --tb=short modules/tests/ ``` - Check code coverage diff --git a/docs/getting-started.md b/docs/getting-started.md index 45ec94e..c1fc913 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -15,6 +15,26 @@ $ git clone https://github.com/PROxZIMA/DarkSpider.git ### Dependencies You'll also need to install dependencies: +- [`Neo4j`](https://neo4j.com/) :: For desktop app, see the [official installation](https://neo4j.com/download-center/#desktop) docs + - Open Neo4j desktop application. + - New > Create project > Add > Local DBMS > Enter name `Graph DBMS` and password `<>` > Create > Start. + - Create an [`APOC` config file](https://neo4j.com/docs/apoc/current/config/) with the following content. + + ```ruby + apoc.export.file.enabled=true + apoc.import.file.use_neo4j_config=false + ``` + - Select project > Click `Graph DBMS` > Plugins pane > `APOC` > Install and Restart. + - Wait for the database to start, then open the Neo4j Browser. + - Run `:server status` and note down `<>` and `<>`. + - Create a new `.env` file in the root of the project directory with the following content. + + ```ruby + NEO4J_SERVER=server_uri + NEO4J_USER=user + NEO4J_PASSWORD=password + ``` + - [`wxPython`](https://wxpython.org/) :: For Linux, see the [official installation](https://wxpython.org/pages/downloads/index.html) docs ```shell diff --git a/docs/index.md b/docs/index.md index be07b39..1336402 100644 --- a/docs/index.md +++ b/docs/index.md @@ -10,7 +10,7 @@ permalink: / DarkSpider is a multithreaded crawler and extractor for regular or onion webpages through the TOR network, written in Python. {: .fs-6 .fw-300 } -[Get started now](Getting-Started){: .btn .btn-primary .fs-5 .mb-4 .mb-md-0 .mr-2 } [View it on GitHub](https://github.com/PROxZIMA/DarkSpider/){: .btn .fs-5 .mb-4 .mb-md-0 } +[Get started now](getting-started){: .btn .btn-primary .fs-5 .mb-4 .mb-md-0 .mr-2 } [View it on GitHub](https://github.com/PROxZIMA/DarkSpider/){: .btn .fs-5 .mb-4 .mb-md-0 } {: .warning } > Crawling is not illegal, but violating copyright is. It’s always best to double check a website’s T&C before crawling them. Some websites set up what’s called `robots.txt` to tell crawlers not to visit those pages. This crawler will allow you to go around this, but we always recommend respecting `robots.txt`.
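Since the getting-started steps above leave `<>` placeholders for the server URI and credentials, readers may want to confirm their `.env` values before kicking off a long crawl. Below is a short, hypothetical connectivity check (not part of the repository) that uses the same `python-dotenv` and `neo4j` driver calls `DatabaseManager.get_graph_driver()` makes at startup:

```python
# check_neo4j.py -- hypothetical helper for verifying the .env setup
import os

from dotenv import load_dotenv
from neo4j import GraphDatabase

load_dotenv()  # pulls NEO4J_SERVER, NEO4J_USER, NEO4J_PASSWORD from .env

uri = os.environ.get("NEO4J_SERVER")
auth = (os.environ.get("NEO4J_USER"), os.environ.get("NEO4J_PASSWORD"))

driver = GraphDatabase.driver(uri, auth=auth, encrypted=False)
driver.verify_connectivity()  # raises AuthError/ServiceUnavailable on bad credentials or a stopped DBMS
print("Neo4j connectivity OK:", uri)
driver.close()
```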
diff --git a/modules/crawler.py b/modules/crawler.py index f1b2e37..1c76a22 100644 --- a/modules/crawler.py +++ b/modules/crawler.py @@ -1,11 +1,11 @@ -import json -import os import re import time +from collections import defaultdict from concurrent.futures import ThreadPoolExecutor, as_completed from io import TextIOBase from logging import Logger -from typing import Dict, List, Tuple, Union +from shutil import get_terminal_size +from typing import Dict, List, Set, Tuple, Union from urllib.parse import urljoin import requests @@ -13,7 +13,7 @@ from requests.models import Response from modules.checker import url_canon -from modules.helper import get_requests_header +from modules.helper import DatabaseManager, get_requests_header class Crawler: @@ -28,10 +28,10 @@ class Crawler: external: True if external links are to be crawled else False. exclusion: Paths that you don't want to include. thread: Number pages to visit (Threads) at the same time. + db: Neo4j :class:`DatabaseManager` object logger: A logger object to log the output. """ - network_file = "network_structure.json" __headers = get_requests_header() def __init__( @@ -44,6 +44,7 @@ def __init__( external: bool, exclusion: str, thread: int, + db: DatabaseManager, logger: Logger, ): self.website = website @@ -54,16 +55,11 @@ def __init__( self.external = external self.exclusion = rf"{exclusion}" if exclusion else None self.thread = thread + self.db = db self.logger = logger + self.extras = defaultdict(lambda: defaultdict(list)) self.__executor = ThreadPoolExecutor(max_workers=min(32, self.thread)) - self.__files = { - "extlinks": open(os.path.join(self.out_path, "extlinks.txt"), "w+", encoding="UTF-8"), - "telephones": open(os.path.join(self.out_path, "telephones.txt"), "w+", encoding="UTF-8"), - "mails": open(os.path.join(self.out_path, "mails.txt"), "w+", encoding="UTF-8"), - "network_structure": os.path.join(self.out_path, self.network_file), - "links": os.path.join(self.out_path, "links.txt"), - } def __get_tor_session(self) -> requests.Session: """Get a new session with Tor proxies. @@ -77,7 +73,7 @@ def __get_tor_session(self) -> requests.Session: session.verify = False return session - def excludes(self, link: str) -> bool: + def excludes(self, link: str, parent_url: str) -> bool: """Excludes links that are not required. 
Args: @@ -98,15 +94,15 @@ def excludes(self, link: str) -> bool: if link.startswith("http") and not link.startswith(self.website): if self.external: return False - self.__files["extlinks"].write(str(link) + "\n") + self.extras["Extlink"][parent_url].append(link) return True # Telephone Number if link.startswith("tel:"): - self.__files["telephones"].write(str(link) + "\n") + self.extras["Telephone"][parent_url].append(link) return True # Mails if link.startswith("mailto:"): - self.__files["mails"].write(str(link) + "\n") + self.extras["Mail"][parent_url].append(link) return True # Type of files if re.search("^.*\\.(pdf|jpg|jpeg|png|gif|doc|js|css)$", link, re.IGNORECASE): @@ -166,7 +162,7 @@ def __crawl_link( for link in soup.findAll("a"): link = link.get("href") - if self.excludes(link): + if self.excludes(link, url): continue ver_link = self.canonical(url, link) @@ -177,7 +173,7 @@ def __crawl_link( for link in soup.findAll("area"): link = link.get("href") - if self.excludes(link): + if self.excludes(link, url): continue ver_link = self.canonical(url, link) @@ -201,7 +197,7 @@ def crawl(self) -> Dict[str, List[str]]: """ ord_lst = set([self.website]) old_level = [self.website] - cur_level = set() + cur_level: Set[str] = set() self.logger.info( f"Crawler started from {self.website} with {self.depth} depth, " @@ -209,21 +205,22 @@ def crawl(self) -> Dict[str, List[str]]: f"Thread{'s'[:self.thread^1]}. Excluding '{self.exclusion}' links." ) - # Json dictionary - json_data = {} # Depth for index in range(0, int(self.depth)): session = self.__get_tor_session() # Sumbit all the links to the thread pool - futures = [ - self.__executor.submit(self.__crawl_link, url=url, session=session) - for url in old_level - if url not in json_data - ] + futures = [self.__executor.submit(self.__crawl_link, url=url, session=session) for url in old_level] + _flength = len(futures) + _i = 0 # Get the results from list of futures and update the json_data for future in as_completed(futures): + _i += 1 + _percent = int((_i / _flength) * 100) + _width = (_percent + 1) // 4 + print(" " * get_terminal_size().columns, end="\r", flush=True) + url, url_data, response_code = future.result() if isinstance(response_code, int): self.logger.debug("%s :: %d", url, response_code) @@ -234,11 +231,15 @@ def crawl(self) -> Dict[str, List[str]]: # Add url_data to crawled links. cur_level = cur_level.union(url_data) - print(f"-- Results: {len(cur_level)}\r", end="", flush=True) + print( + f"[{'#'*_width}{' '*(25-_width)}]{_percent: >3}% -- Results: {len(cur_level)}", + end="\r", + flush=True, + ) - # Adding to json data - json_data[url] = list(url_data) + self.db.create_linkage(url, list(url_data)) + print(" " * get_terminal_size().columns, end="\r", flush=True) # Get the next level withouth duplicates. 
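For reviewers following the `extras` buffer introduced above: it replaces the old `extlinks.txt`, `telephones.txt` and `mails.txt` output files by grouping excluded links first by label and then by the page they were found on, and it is flushed to Neo4j once per depth level through `create_labeled_link`. A small illustration of the structure it accumulates (URLs invented for the example):

```python
from collections import defaultdict

extras = defaultdict(lambda: defaultdict(list))

# What Crawler.excludes(link, parent_url) records while parsing one page
extras["Mail"]["http://example.onion/contact"].append("mailto:admin@example.onion")
extras["Telephone"]["http://example.onion/contact"].append("tel:+123456789")
extras["Extlink"]["http://example.onion/"].append("http://clearnet-site.com/")

# Flushed at the end of each depth level in Crawler.crawl():
#     for label, data in extras.items():
#         db.create_labeled_link(label, data)
# which MERGEs (:WebPage {url: parent})-[:CONTAINS]->(:Mail|:Telephone|:Extlink {url: link})
```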
clean_cur_level = cur_level.difference(ord_lst) # Merge both ord_lst and cur_level into ord_lst @@ -249,23 +250,16 @@ def crawl(self) -> Dict[str, List[str]]: cur_level = set() self.logger.info("Step %d completed :: %d result(s)", index + 1, len(ord_lst)) - # Creating json - with open(self.__files["network_structure"], "w", encoding="UTF-8") as lst_file: - json.dump(json_data, lst_file, indent=2, sort_keys=False) + for label, data in self.extras.items(): + self.db.create_labeled_link(label, data) - with open(self.__files["links"], "w+", encoding="UTF-8") as file: - for url in sorted(ord_lst): - file.write(f"{url}\n") + self.extras = defaultdict(lambda: defaultdict(list)) + session.close() # Pause time time.sleep(self.pause) # Close the executor, don't wait for all threads to finish self.__executor.shutdown(wait=False) - # Close the output files and return the json_data - for file in self.__files.values(): - if isinstance(file, TextIOBase): - file.close() - - return json_data + return self.db.get_network_structure() diff --git a/modules/extractor.py b/modules/extractor.py index 4eac376..28a1587 100644 --- a/modules/extractor.py +++ b/modules/extractor.py @@ -1,28 +1,20 @@ import logging import os -import re from concurrent.futures import ThreadPoolExecutor, as_completed from http.client import IncompleteRead, InvalidURL from io import TextIOWrapper from logging import Logger -from typing import Dict, List, Optional, Tuple, Union +from shutil import get_terminal_size +from typing import Dict, List, Optional from urllib.error import HTTPError, URLError -from urllib.parse import urlparse import requests import yara as _yara from bs4 import BeautifulSoup +from neo4j.time import DateTime from modules.checker import folder -from modules.helper import get_requests_header - -# Type hinting aliases -ExcInfo = Union[Exception, bool] -LogMsg = Tuple[str] -LogLevel = int -Log = Tuple[LogLevel, LogMsg, ExcInfo] -SingleRes = List[Log] -Results = List[SingleRes] +from modules.helper import DatabaseManager, Result, get_requests_header class Extractor: @@ -36,6 +28,7 @@ class Extractor: input_file: Filename of crawled/discovered URLs. out_path: Dir path for output files. thread: Number pages to extract (Threads) at the same time. + db: Neo4j :class:`DatabaseManager` object yara: keyword search option. logger: A logger object to log the output. """ @@ -53,6 +46,7 @@ def __init__( input_file: str, out_path: str, thread: int, + db: DatabaseManager, yara: Optional[int], logger: Logger, ): @@ -67,69 +61,45 @@ def __init__( self.out_path = folder(os.path.join(out_path, self.__extract_folder)) self.thread = thread + self.db = db self.yara = yara self.logger = logger self.__executor = ThreadPoolExecutor(max_workers=min(32, self.thread)) self.__session = self.__get_tor_session() - def extract(self) -> Results: + def extract(self) -> Optional[str]: """Extracts the contents of the input file/single URL into the outputs folder/file/terminal. - Note: - `Log` represents either Yara or an Exception output. - - If Yara search is True then content is written to file/terminal. - - If Yara search is False then content is ignored. 
- - A `Log` is a tuple of `LogLevel`, `LogMsg` and `ExcInfo` with the following format: - (`LogLevel`, (`msg`, `*args`), `Exception()` or `False`) - - `SingleRes` of an url is a list of `Log` with the following format: - [ - (10, ("%s :: %s match found!", "`http://example.com/file.html`", "Yara"), False), - - (10, ("IOError Error :: %s", "`http://example.com/file.html`"), IOError()), - ] - Returns: - `Results` of an input which is a List of `SingleRes` with the following format: - - [[ - (10, ("%s :: %s match found!", "`http://example.com`", "Yara"), False), - - (10, ("File created :: %s", "example.com/extracted/example.com/_.html"), False), - ], [ - (10, ("%s :: %s match found!", "`http://example.com/main.html`", "No yara"), False) - ], [ - (10, ("%s :: %s match found!", "`http://example.com/file.html`", "Yara"), False), - - (10, ("IOError Error :: %s", "`http://example.com/file.html`"), IOError()), - ]] + None """ - results: Results = [] if len(self.input_file) > 0: - if self.crawl or self.out_path: - # Crawl(output folder) | INput file | EXtract - results = self.__cinex(self.input_file, self.out_path, self.yara) + if self.crawl: + # Crawl | INput db | EXtract + self.__cinex(self.yara, self.db) else: # TERMinal | INput file | EXtract - results = self.__terminex(self.input_file, self.yara) + self.__terminex(self.input_file, self.yara) else: if len(self.output_file) > 0: # OUTput file | EXtract self.output_file = os.path.join(self.out_path, self.output_file) - single_res = self.__outex(self.website, self.output_file, self.yara) + result = self.__outex(self.website, self.output_file, self.yara) else: # TERMinal | EXtract - single_res = self.__termex(self.website, self.yara) + result = self.__termex(self.website, self.yara) - for level, args, exception in single_res: - self.logger.log(level, *args, exc_info=exception) + for log_type in (result.yara, result.extract, result.error): + if log_type is not None: + level, args, exception = log_type + self.logger.log(level, *args, exc_info=exception) - results.append(single_res) - return results + dataset_path = os.path.join(os.getcwd(), self.out_path, "dataset.csv") + if self.db.save_all_scrape_data_as_csv(file_path=dataset_path): + self.logger.info("Dataset created :: %s", dataset_path) + return dataset_path + return None def __get_tor_session(self) -> requests.Session: """Get a new session with Tor proxies. @@ -143,23 +113,25 @@ def __get_tor_session(self) -> requests.Session: session.verify = False return session - def __cinex(self, input_file: str, out_path: str, yara: Optional[int]) -> Results: - """Ingests the crawled links from the input_file, + # Deprecate input file extraction to an output folder after crawling + # Use database to get all nodes, extract them and save output to the database. + def __cinex(self, yara: Optional[int], db: DatabaseManager) -> None: + """Ingests the crawled links from the database, scrapes the contents of the resulting web pages and writes the contents to - the into out_path/{url_address}. + the Database. Args: - input_file: Filename of the crawled Urls. - out_path: Dir path for results. yara: Keyword search argument. + db: Neo4j :class:`DatabaseManager` object Returns: - List of `SingleRes` for each url in input.
+ None """ - self.logger.info("Cinex :: Extracting from %s to %s", input_file, out_path) - return self.__inex(input_file=input_file, yara=yara, out_path=out_path) + self.logger.info("Cinex :: Extracting contents of all nodes to Database") + results = self.__inex(yara=yara, db=db) + self.db.add_web_content(results) - def __terminex(self, input_file: str, yara: Optional[int]) -> Results: + def __terminex(self, input_file: str, yara: Optional[int]) -> None: """Input links from file and extract them into terminal. Args: @@ -167,12 +139,12 @@ def __terminex(self, input_file: str, yara: Optional[int]) -> Results: yara: Keyword search argument. Returns: - List of `SingleRes` for each url in input. + None """ self.logger.info("Terminex :: Extracting from %s to terminal", input_file) - return self.__inex(input_file=input_file, yara=yara) + self.__inex(input_file=input_file, yara=yara) - def __outex(self, website: str, output_file: str, yara: Optional[int]) -> SingleRes: + def __outex(self, website: str, output_file: str, yara: Optional[int]) -> Result: """Scrapes the contents of the provided web address and outputs the contents to file. @@ -182,12 +154,12 @@ def __outex(self, website: str, output_file: str, yara: Optional[int]) -> Single yara: Keyword search argument. Returns: - List of `Log` for given website. + :class:`Result` for given website. """ self.logger.info("Outex :: Extracting %s to %s", website, output_file) return self.__ex(website=website, yara=yara, output_file=output_file) - def __termex(self, website: str, yara: Optional[int]) -> SingleRes: + def __termex(self, website: str, yara: Optional[int]) -> Result: """Scrapes provided web address and prints the results to the terminal. Args: @@ -195,87 +167,80 @@ def __termex(self, website: str, yara: Optional[int]) -> SingleRes: yara: Keyword search argument. Returns: - List of `Log` for given website. + :class:`Result` for given website. """ self.logger.info("Termex :: Extracting %s to terminal", website) return self.__ex(website=website, yara=yara) - def __inex(self, input_file: str, out_path: Optional[str] = None, yara: Optional[int] = None) -> Results: + def __inex( + self, + input_file: Optional[str] = None, + yara: Optional[int] = None, + db: Optional[DatabaseManager] = None, + ) -> List[Dict]: """Ingests the crawled links from the input_file, scrapes the contents of the resulting web pages and writes the contents - into the terminal if out_path is None else out_path/{url_address}. + into the terminal if db is None else to database. Args: input_file: Filename of the crawled Urls. - out_path: Dir path for results. yara: Keyword search argument. + db: Neo4j :class:`DatabaseManager` object Returns: - List of `SingleRes` [`Results`] for each url in input. + List of :class:`Result` for each url in input. 
""" - file = TextIOWrapper - try: - file = open(input_file, "r", encoding="UTF-8") - except IOError as _: - self.logger.exception("Read Error :: %s", input_file) - return + results: List[Dict] = [] + urls: List[str] = [] - # Sumbit all the links to the thread pool - futures = [ - self.__executor.submit(self.__generate_file, url=url, yara=yara, out_path=out_path) - for url in file.read().splitlines() - ] + if db is not None: + urls: List[str] = db.get_all_urls() + elif input_file is not None: + file = TextIOWrapper + try: + file = open(input_file, "r", encoding="UTF-8") + urls: List[str] = file.read().splitlines() + file.close() + except IOError as _: + self.logger.exception("Read Error :: %s", input_file) + return results - results: Results = [] + # Sumbit all the links to the thread pool + futures = [self.__executor.submit(self.__ex, website=url, yara=yara) for url in urls] + _flength = len(futures) + _i = 0 # Get the results from list of futures and append them to results for future in as_completed(futures): - single_res = future.result() - results.append(single_res) - - for level, args, exception in single_res: - self.logger.log(level, *args, exc_info=exception) - + _i += 1 + _percent = int((_i / _flength) * 100) + _width = (_percent + 1) // 4 + _stmt = f"[{'#'*_width}{' '*(25-_width)}]{_percent: >3}%" + print( + _stmt + " " * max(get_terminal_size().columns - len(_stmt), 0), + end="\r", + flush=True, + ) + + result = future.result() + results.append(result.dict()) + + # db exists so don't log to terminal + if db is not None: + continue + + for log_type in (result.yara, result.extract, result.error): + if log_type is not None: + level, args, exception = log_type + self.logger.log(level, *args, exc_info=exception) + + print(" " * get_terminal_size().columns, end="\r", flush=True) # Close the executor, don't wait for all threads to finish self.__executor.shutdown(wait=False) - file.close() return results - def __generate_file(self, url: str, out_path: Optional[str], yara: Optional[int]) -> SingleRes: - """Generate output file from url and send it to extractor. - - Args: - url: Url of web address to scrape. - output_file: Filename to write the contents to. - yara: Keyword search argument. - - Returns: - List of `Log` [`SingleRes`] for given url. - """ - output_file = None - if out_path is not None: - try: - # http://a.com/b.ext?x=&y=$%z2 -> a.com/b.extxyz2_.html - uri = urlparse(url) - output_file = os.path.join( - out_path, - os.path.join(uri.netloc, *uri.path.split("/")) + re.sub(r"[^\w_.)( -]", "", uri.query) + "_.html", - ) - # Create the directory if it doesn't exist - folder(output_file, is_file=True) - except Exception as err: - return [ - ( - logging.DEBUG, - ("Output File Error :: %s", url), - err, - ) - ] - - return self.__ex(website=url, yara=yara, output_file=output_file) - - def __ex(self, website: str, output_file: str = None, yara: Optional[int] = None) -> SingleRes: + def __ex(self, website: str, output_file: Optional[str] = None, yara: Optional[int] = None) -> Result: """Scrapes the contents of the provided web address and outputs the contents to file or terminal. @@ -285,24 +250,31 @@ def __ex(self, website: str, output_file: str = None, yara: Optional[int] = None yara: Keyword search argument. Returns: - List of `Log` [`SingleRes`] for given website. + List of `Log` [`Result`] for given website. 
""" - result = [] + result = Result(url=website, scrape_datetime=DateTime.now()) try: content = self.__session.get(website, allow_redirects=True, timeout=10).text + raw = self.__text(response=content).lower() + result.scrape_html = content + result.scrape_data = raw + if yara is not None: - full_match_keywords = self.__check_yara(raw=content, yara=yara) - result.append( + if yara == 1: + content = raw + + full_match_keywords = self.__check_yara(data=content) + result.yara_code = 1 if full_match_keywords["matches"] else 0 + + result.yara = ( + logging.DEBUG, ( - logging.DEBUG, - ( - "%s :: %s match found!", - website, - "Yara" if full_match_keywords["matches"] else "No yara", - ), - False, - ) + "%s :: %s match found!", + website, + "Yara" if full_match_keywords["matches"] else "No yara", + ), + False, ) # Don't write to file/terminal if no matches found. @@ -312,47 +284,43 @@ def __ex(self, website: str, output_file: str = None, yara: Optional[int] = None if output_file is not None: with open(output_file, "w", encoding="UTF-8") as file: file.write(content) - result.append((logging.DEBUG, ("File created :: %s", output_file), False)) + result.extract = (logging.DEBUG, ("File created for %s :: %s", website, output_file), False) else: - result.append((logging.INFO, ("%s :: %s", website, content), False)) + result.extract = (logging.INFO, ("%s :: %s", website, content), False) except HTTPError as err: - result.append((logging.DEBUG, ("Request Error :: %s", website), err)) + result.error = (logging.DEBUG, ("Request Error :: %s", website), err) except (InvalidURL, URLError) as _: - result.append((logging.DEBUG, ("Invalid URL Error :: %s :: Skipping...", website), False)) + result.error = (logging.DEBUG, ("Invalid URL Error :: %s :: Skipping...", website), False) except IncompleteRead as _: - result.append((logging.DEBUG, ("Incomplete Read Error :: %s", website), False)) + result.error = (logging.DEBUG, ("Incomplete Read Error :: %s", website), False) except IOError as err: - result.append((logging.DEBUG, ("IOError Error :: %s", website), err)) + result.error = (logging.DEBUG, ("IOError Error :: %s", website), err) except Exception as err: - result.append((logging.DEBUG, ("Error :: %s", website), err)) + result.error = (logging.DEBUG, ("Error :: %s", website), err) return result - def __check_yara(self, raw: str, yara: int = 0) -> Dict[str, list]: + def __check_yara(self, data: str) -> Dict[str, list]: """Validates Yara Rule to categorize the site and check for keywords. Args: - yara: Keyword search argument. - raw: HTTP Response body. + data: HTTP Response body. Returns: Dictionary of yara rule matches. 
{"namespace":[match1,match2,...]} """ - if raw is None: + if data is None: return None - if yara == 1: - raw = self.__text(response=raw).lower() - rule_data = [] def callback(data): rule_data.append(data) return 0 # yara.CALLBACK_CONTINUE - matches = self.__yara_rules.match(data=raw, callback=callback) + _matches = self.__yara_rules.match(data=data, callback=callback) return rule_data[0] @@ -370,4 +338,4 @@ def __text(self, response: str) -> str: for s in soup(["script", "style"]): s.decompose() - return " ".join(soup.stripped_strings) + return " ".join((_.replace("\n", " ") for _ in soup.stripped_strings)) diff --git a/modules/helper/__init__.py b/modules/helper/__init__.py index ebbf52a..810ce11 100644 --- a/modules/helper/__init__.py +++ b/modules/helper/__init__.py @@ -1,3 +1,4 @@ +from .database import * from .exceptions import * from .header import * from .helper import * diff --git a/modules/helper/database.py b/modules/helper/database.py new file mode 100644 index 0000000..f5819c2 --- /dev/null +++ b/modules/helper/database.py @@ -0,0 +1,370 @@ +import os +import threading +from collections import defaultdict +from dataclasses import dataclass +from typing import Any, Dict, List, Literal, Optional, Tuple, Union + +from dotenv import load_dotenv +from neo4j import GraphDatabase, Record +from neo4j.exceptions import AuthError, ClientError, ServiceUnavailable +from neo4j.time import DateTime +from neo4j.work import ResultSummary + +from modules.helper.logger import setup_custom_logger + + +@dataclass +class Result: + """Extractor scraping result class + + (`LogLevel`, (`msg`, `*args`), `Exception()` or `False`) + + Attributes: + url: Uniform Resource Locator of the webpage + index_counter: An automatic incremental counter; the number of times a webpage is encountered while crawling + index_datetime: :class:`neo4j.time.DateTime` automatically created while crawling + scrape_datetime: :class:`neo4j.time.DateTime` manually created while extracting + scrape_html: HTML content of the webpage + scrape_data: Text content of the webpage + yara_code: Yara match status of the page""" + + url: str + # index_counter: int = 0 + # index_datetime: DateTime = field(default_factory=DateTime.now) + scrape_datetime: Optional[DateTime] = None + scrape_html: Optional[str] = None + scrape_data: Optional[str] = None + yara_code: Optional[int] = None + yara: Optional[Tuple[int, Tuple[str], Union[Exception, Literal[False]]]] = None + extract: Optional[Tuple[int, Tuple[str], Union[Exception, Literal[False]]]] = None + error: Optional[Tuple[int, Tuple[str], Union[Exception, Literal[False]]]] = None + + def __str__(self) -> str: + return f"{{yara: {self.yara}, extract: {self.extract}, error: {self.error}}}" + + def __repr__(self) -> str: + return f"Result(yara: {self.yara}, extract: {self.extract}, error: {self.error})" + + def dict(self) -> Dict[str, Any]: + """Return dictionry representation for Neo4j""" + return { + "url": self.url, + "scrape_datetime": self.scrape_datetime, + "scrape_html": self.scrape_html, + "scrape_data": self.scrape_data, + "yara": self.yara, + } + + +class DatabaseManager: + """Instantiates the Neo4J Graph Database + + Attributes: + out_path: Output path for the log files + server: URI examples: "bolt://localhost:7687", "neo4j+s://xxx.databases.neo4j.io" + user: database user + password: database password + """ + + server = None + user = None + password = None + driver = None + logger = None + write_lock = threading.Lock() + labels = [("WebPage", "url")] + keys = {"WebPage": ["url", 
"index_counter", "index_datetime", "scrape_datetime", "scrape_html", "scrape_data"]} + + def __init__(self, out_path: str, server: str, user: str = None, password: str = None): + self.logger = setup_custom_logger( + name="dblog", filename=os.path.join(out_path, "db.log"), verbose_=True, filelog=True, screenlog=False + ) + self.server = server + self.user = user + self.password = password + self.get_graph_driver(self.server, self.user, self.password) + self.create_indexes() + + @staticmethod + def _transaction_function(tx, **kwargs): + result = tx.run(kwargs.pop("query"), **kwargs) + records = list(result) # a list of Record objects + summary = result.consume() + return records, summary + + def query(self, requested: bool = False, **kwargs) -> Optional[Tuple[List[Record], ResultSummary]]: + """execute a query into the graph""" + # self.write_lock.acquire() + records = [] + summary: ResultSummary = None + _query = kwargs.get("query", None) + try: + with self.driver.session(database="neo4j") as session: + records, summary = session.execute_write(transaction_function=self._transaction_function, **kwargs) + except ClientError as e: + if str(e.message).startswith("An equivalent index"): + raise + self.logger.error("ClientError :: Transaction failed with %s", e.message, exc_info=e) + except Exception as e: + self.logger.error("Error :: Transaction failed. Query :: %s", _query, exc_info=e) + finally: + # self.write_lock.release() + if requested and records and summary: + return records, summary + + def get_graph_driver(self, uri: str, username: str, password: str) -> None: + """sets up graph client""" + try: + auth = None + if username and password: + auth = (username, password) + self.driver = GraphDatabase.driver(uri, auth=auth, encrypted=False) + self.driver.verify_connectivity() + self.logger.info("Neo4J database connectivity successful") + except AuthError as e: + self.logger.error("AuthError :: Could not authenticate to Neo4j database server", exc_info=e) + raise + except Exception as e: + self.logger.error("Error :: Failed to create graph client", exc_info=e) + raise + + def create_indexes(self): + """create indexes for faster lookup""" + for label, _property in self.labels: + query = f"CREATE INDEX IF NOT EXISTS FOR (n:{label}) ON (n.{_property})" + try: + self.query(requested=False, query=query) + except ClientError: + pass + + def get_all_urls(self) -> List[str]: + """ + Returns: + List of all urls in the database + """ + query = "MATCH (w:WebPage) RETURN w.url AS url" + result = self.query(requested=True, query=query) + if not result: + return [] + records, summary = result + all_urls = [row["url"] for row in records] + self.logger.info("(%d ms) get_all_urls()->(%s)", summary.result_available_after, all_urls) + + return all_urls + + def get_network_structure(self) -> Dict[str, List[str]]: + """ + Returns: + List of all urls in the database + """ + query = "MATCH (w1:WebPage)-[:POINTS_TO]->(w2:WebPage) RETURN w1.url AS url, COLLECT(w2.url) AS points_to" + result = self.query(requested=True, query=query) + network_structure = {} + if not result: + return network_structure + records, summary = result + for row in records: + network_structure[row["url"]] = row["points_to"] + self.logger.info("(%d ms) get_network_structure()->(%s)", summary.result_available_after, network_structure) + + return network_structure + + def get_all_scrape_data(self) -> List[str]: + """ + Returns: + List of all scrape_data in the database + """ + query = "MATCH (w:WebPage) RETURN w.scrape_data AS scrape_data" + 
result = self.query(requested=True, query=query) + if not result: + return [] + records, summary = result + all_scrape_data = [row["scrape_data"] for row in records] + self.logger.info("(%d ms) get_all_scrape_data()->()", summary.result_available_after) + + return all_scrape_data + + def save_all_scrape_data_as_csv(self, file_path=None) -> bool: + """ + Args: + file_path: Location of the file to save the csv to. + + Returns: + None + """ + if file_path is None: + return + file_path = ("file:///" + file_path).replace("\\", "/") + query = f'CALL apoc.export.csv.query("MATCH (w:WebPage) RETURN w.url as url, w.scrape_data AS scrape_data", "{file_path}", {{}})' + result = self.query(requested=True, query=query) + if not result: + return False + records, summary = result + self.logger.info( + "(%d ms) save_all_scrape_data_as_csv()->(%s)", summary.result_available_after, records[0]["file"] + ) + return True + + def create_linkage(self, wp1_url: str, hyperlinks: List[str]) -> bool: + """Create links between a WebPage with URL `wp1_url` containing a list of `hyperlinks` + + Args: + wp1_url: URL of WebPage 1 + hyperlinks: List of URLs in WebPage 1 + + Returns: + None + """ + query = f""" + UNWIND $hyperlinks AS link + MERGE (w1:WebPage {{ url: "{wp1_url}" }}) + ON CREATE + SET + w1.index_counter = 1, + w1.index_datetime = datetime() + MERGE (w2:WebPage {{ url: link }}) + SET + w2.index_counter = COALESCE(w2.index_counter, 0) + 1, + w2.index_datetime = datetime() + MERGE (w1)-[:POINTS_TO]->(w2) + RETURN w1{{.url, .index_counter, .index_datetime, .scrape_datetime, .yara_code}}, w2{{.url, .index_counter, .index_datetime, .scrape_datetime, .yara_code}}""" + result = self.query(requested=True, query=query, hyperlinks=hyperlinks) + if not result: + return False + records, summary = result + self.logger.info("(%d ms) Created linkage between following items", summary.result_available_after) + for row in records: + self.logger.info("(%s)-[:POINTS_TO]->(%s)", row["w1"], row["w2"]) + return True + + def create_labeled_link(self, label: str, hyperlinks: Dict[str, List[str]]) -> bool: + """Create labeled links between `hyperlinks[i][0]` containing `hyperlinks[i][1]` + + Args: + label: Label of link like "Extlink", "Mail", or "Telephone" + hyperlinks: List of pairwise URLs with [`hyperlinks[i][0]` contains `hyperlinks[i][1]`] relationship + + Returns: + None + """ + query = f""" + UNWIND keys($hyperlinks) AS parent + WITH parent, $hyperlinks[parent] AS content + UNWIND content AS link + MERGE (w1:WebPage {{ url: parent }}) + ON CREATE + SET + w1.index_counter = 1, + w1.index_datetime = datetime() + MERGE (w2:{label} {{ url: link }}) + SET + w2.index_counter = COALESCE(w2.index_counter, 0) + 1, + w2.index_datetime = datetime() + MERGE (w1)-[:CONTAINS]->(w2) + RETURN w1{{.url, .index_counter, .index_datetime, .scrape_datetime, .yara_code}}, COLLECT(w2{{.url, .index_counter, .index_datetime}}) as w2""" + result = self.query(requested=True, query=query, hyperlinks=hyperlinks) + if not result: + return False + records, summary = result + self.logger.info( + "(%d ms) Created '%s' relationship between following items", summary.result_available_after, label + ) + for row in records: + self.logger.info("(%s)-[:CONTAINS]->(%s)", row["w1"], row["w2"]) + return True + + def add_web_content(self, data: List[Dict]) -> bool: + """Add a list of dictionaries derived from :class:`Result` objects to the database. 
Each dictionary contains keys like url, scrape_datetime, scrape_html, scrape_data, yara_code + + Args: + data: List of Dict derived from :class:`Result` objects + + Returns: + None + """ + query = """ + UNWIND $data AS page + MERGE (w1:WebPage { url: page.url }) + ON CREATE + SET + w1.index_counter = 1, + w1.index_datetime = datetime() + SET + w1.scrape_datetime = page.scrape_datetime, + w1.scrape_data = page.scrape_data, + w1.scrape_html = page.scrape_html, + w1.yara = page.yara + RETURN w1{.url, .index_counter, .index_datetime, .scrape_datetime, .yara_code}""" + result = self.query(requested=True, query=query, data=data) + if not result: + return False + records, summary = result + self.logger.info("(%d ms) Added web content for following items", summary.result_available_after) + for row in records: + self.logger.info("%s", row["w1"]) + return True + + def db_summary(self): + """Summary about the database""" + count_query = self.query( + requested=False, query="MATCH (n) RETURN count(labels(n)) AS count, labels(n) AS labels" + ) + return count_query + + def delete_db(self): + """Delete all the nodes and relationships in the database""" + self.query(requested=False, query="MATCH (n) DETACH DELETE n") + self.logger.info("Neo4J database cleaned") + + # def __del__(self): + # self.shutdown() + + def shutdown(self): + """Close the driver connection""" + if self.driver: + self.logger.info("Closing the Neo4J session") + self.driver.close() + + +if __name__ == "__main__": + # Aura queries use an encrypted connection using the "neo4j+s" URI scheme + load_dotenv() + + app = DatabaseManager( + "output", + os.environ.get("NEO4J_SERVER"), + os.environ.get("NEO4J_USER"), + os.environ.get("NEO4J_PASSWORD"), + ) + app.delete_db() + + app.create_linkage("ABC", "DEF") + app.create_linkage("DEF", "GHI") + app.create_linkage("DEF", "JKL") + app.create_linkage("JKL", "ABC") + + extras = defaultdict(lambda: defaultdict(list)) + extras["Mail"]["ABC"] = ["ABC Mail", "ABC Mail2"] + extras["Mail"]["DEF"] = ["DEF Mail", "DEF Mail2"] + extras["Mail"]["GHI"] = ["GHI Mail", "GHI Mail2"] + extras["Telephone"]["ABC"] = ["ABC Telephone", "ABC Telephone2"] + extras["Telephone"]["DEF"] = ["DEF Telephone", "DEF Telephone2"] + extras["Telephone"]["GHI"] = ["GHI Telephone", "GHI Telephone2"] + for label, data in extras.items(): + app.create_labeled_link(label, data) + + data = [ + Result( + url="ABC", scrape_datetime=DateTime.now(), scrape_data="ABC DATA", scrape_html="ABC HTML", yara=0 + ).dict(), + Result(url="DEF", scrape_datetime=DateTime.now(), scrape_data="DEF DATA", scrape_html="DEF HTML").dict(), + Result( + url="GHI", scrape_datetime=DateTime.now(), scrape_data="GHI DATA", scrape_html="GHI HTML", yara=1 + ).dict(), + ] + app.add_web_content(data=data) + + app.get_network_structure() + dataset_path = os.path.join(os.getcwd(), "output", "dataset.csv") + app.save_all_scrape_data_as_csv(file_path=dataset_path) + app.shutdown() diff --git a/modules/helper/helper.py b/modules/helper/helper.py index 5106a3d..db53bb9 100644 --- a/modules/helper/helper.py +++ b/modules/helper/helper.py @@ -1,6 +1,7 @@ import difflib import os import sys +from functools import wraps from io import StringIO from typing import Dict @@ -34,6 +35,7 @@ def __exit__(self, *args): def verbose(func): """Verbose decorator""" + @wraps(func) def wrapper(*args, **kwargs): args[0].logger.info("Generating :: %s..", func.__doc__) plt.cla() @@ -43,8 +45,6 @@ def wrapper(*args, **kwargs): plt.savefig(os.path.join(args[0].out_path, f"{func.__name__}.png"), 
bbox_inches="tight") return ret - wrapper.__doc__ = func.__doc__ - wrapper.__name__ = func.__name__ return wrapper @@ -81,7 +81,16 @@ def get_tor_proxies(port: int = 9050) -> Dict[str, str]: } -def assertMsg(expected, result): +def assert_msg(expected: object, result: object) -> str: + """Compare and print difference between 2 objects. Objects must have string reprensentation. + + Args: + expected: theoritical value + result: observed value + + Returns: + Colored text difference between the string representation of the objects. + """ old, new = str(expected), str(result) bold = lambda text: f"{Colors.BOLD}{text}{Colors.RESET}" diff --git a/modules/helper/logger.py b/modules/helper/logger.py index 7b9d677..e16d84a 100644 --- a/modules/helper/logger.py +++ b/modules/helper/logger.py @@ -3,7 +3,7 @@ import sys import time from logging.handlers import RotatingFileHandler -from typing import List +from typing import List, Optional from modules.helper.header import Colors @@ -12,10 +12,10 @@ class RollingFileHandler(RotatingFileHandler): """Custom RotatingFileHandler for incremental infinite logging""" def __init__(self, filename, mode="a", maxBytes=0, backupCount=0, encoding=None, delay=False, errors=None): - self.last_backup_cnt = int(time.time()) + self.last_backup_cnt = 0 self.filename = filename super(RollingFileHandler, self).__init__( - filename="{0}.{2}.init{1}".format(*os.path.splitext(self.filename), self.last_backup_cnt), + filename="{0}.{2}{1}".format(*os.path.splitext(self.filename), self.last_backup_cnt), mode=mode, maxBytes=maxBytes, backupCount=backupCount, @@ -58,7 +58,12 @@ def format(self, record): def setup_custom_logger( - name: str, filename: str = "log.log", verbose_: bool = False, filelog: bool = True, argv: List[str] = None + name: str, + filename: str = "log.log", + verbose_: bool = False, + filelog: bool = True, + screenlog: bool = True, + argv: Optional[List[str]] = None, ) -> logging.Logger: """Setup custom logger with stream and file handlers @@ -77,7 +82,7 @@ def setup_custom_logger( # Create file handler if filelog is True if filelog: - file_handler = RollingFileHandler(filename=filename, mode="w", maxBytes=1024 * 1024 * 10) + file_handler = RollingFileHandler(filename=filename, mode="w", maxBytes=1024 * 1024 * 10) # 10MB file_handler.setFormatter(None) file_handler.setLevel(logging.DEBUG) logger.addHandler(file_handler) @@ -95,6 +100,10 @@ def setup_custom_logger( ) ) + # Return logger if screen log is disabled + if not screenlog: + return logger + formatter, fmt, level = ( (CustomFormatter, "[{color}{{levelname:^7s}}{reset}] {{message}}", logging.DEBUG) if verbose_ diff --git a/modules/tests/test_checker.py b/modules/tests/test_checker.py index 7b10a7d..2b582fb 100644 --- a/modules/tests/test_checker.py +++ b/modules/tests/test_checker.py @@ -6,7 +6,7 @@ from unittest import mock from modules.checker import check_ip, check_tor, extract_domain, folder, url_canon -from modules.helper import TorProxyException, TorServiceException, assertMsg, get_tor_proxies, setup_custom_logger +from modules.helper import TorProxyException, TorServiceException, assert_msg, get_tor_proxies, setup_custom_logger class MockedPsutilProcess: @@ -64,42 +64,42 @@ def test_url_canon_001(self): url = "www.darkspider.com" expected = (True, "http://www.darkspider.com") result = url_canon(url, www=False) - self.assertEqual(expected, result, assertMsg(expected, result)) + self.assertEqual(expected, result, assert_msg(expected, result)) def test_url_canon_002(self): """url_canon unit test.""" url 
= "www.darkspider.com" expected = (True, "http://www.darkspider.com") result = url_canon(url, www=True) - self.assertEqual(expected, result, assertMsg(expected, result)) + self.assertEqual(expected, result, assert_msg(expected, result)) def test_url_canon_003(self): """url_canon unit test.""" url = "darkspider.com" expected = (True, "http://www.darkspider.com") result = url_canon(url, www=True) - self.assertEqual(expected, result, assertMsg(expected, result)) + self.assertEqual(expected, result, assert_msg(expected, result)) def test_url_canon_004(self): """url_canon unit test.""" url = "http://darkspider.com/" expected = (False, "http://darkspider.com") result = url_canon(url, www=False) - self.assertEqual(expected, result, assertMsg(expected, result)) + self.assertEqual(expected, result, assert_msg(expected, result)) def test_extract_domain_001(self): """extract_domain test.""" url = "http://darkspider.com/test/domain-extract/api?id=001" expected = "darkspider.com" result = extract_domain(url, remove_http=True) - self.assertEqual(expected, result, assertMsg(expected, result)) + self.assertEqual(expected, result, assert_msg(expected, result)) def test_extract_domain_002(self): """extract_domain test.""" url = "http://darkspider.com/test/domain-extract/api?id=002" expected = "http://darkspider.com" result = extract_domain(url, remove_http=False) - self.assertEqual(expected, result, assertMsg(expected, result)) + self.assertEqual(expected, result, assert_msg(expected, result)) def test_folder_creation_001(self): """folder creation test.""" @@ -113,7 +113,7 @@ def test_folder_creation_002(self): result = folder(_input, True) expected = os.path.dirname(_input) self.assertTrue(os.path.exists(expected), f"Test Fail:: could not find directory of {_input}") - self.assertEqual(expected, result, assertMsg(expected, result)) + self.assertEqual(expected, result, assert_msg(expected, result)) def test_check_ip_001(self): """check_ip test.""" diff --git a/modules/tests/test_extractor.py b/modules/tests/test_extractor.py index c0eedca..5341cb7 100644 --- a/modules/tests/test_extractor.py +++ b/modules/tests/test_extractor.py @@ -7,7 +7,7 @@ from modules.checker import folder from modules.extractor import Extractor -from modules.helper import assertMsg, setup_custom_logger +from modules.helper import assert_msg, setup_custom_logger URL_1 = "http://info.cern.ch/" URL_2 = "http://info.cern.ch/hypertext/WWW/TheProject.html" @@ -95,7 +95,7 @@ def test_text(self, _): expected = """http://info.cern.ch http://info.cern.ch - home of the first website From here you can: Browse the first website Browse the first website using the line-mode browser simulator Learn about the birth of the web Learn about CERN, the physics laboratory where the web was born""" content = self.get_response_text(URL_1) result = self.extractor_1._Extractor__text(response=content) - self.assertEqual(expected, result, assertMsg(expected, result)) + self.assertEqual(expected, result, assert_msg(expected, result)) def test_check_yara_001(self, _): """check_yara unit test.""" @@ -120,7 +120,7 @@ def test_check_yara_001(self, _): content = self.get_response_text(URL_1) result = self.extractor_1._Extractor__check_yara(raw=content, yara=0) - self.assertEqual(expected, result, assertMsg(expected, result)) + self.assertEqual(expected, result, assert_msg(expected, result)) def test_check_yara_002(self, _): """check_yara unit test. 
@@ -145,7 +145,7 @@ def test_check_yara_002(self, _): content = self.get_response_text(URL_1) result = self.extractor_1._Extractor__check_yara(raw=content, yara=1) - self.assertEqual(expected, result, assertMsg(expected, result)) + self.assertEqual(expected, result, assert_msg(expected, result)) @mock.patch("concurrent.futures.ThreadPoolExecutor.shutdown", side_effect=[lambda wait: None]) def test_cinex_001(self, _, __): @@ -171,7 +171,7 @@ def test_cinex_001(self, _, __): result = self.extractor_1._Extractor__cinex(self.inp_file, self.out_path, 0) - self.assertCountEqual(expected, result, assertMsg(expected, result)) + self.assertCountEqual(expected, result, assert_msg(expected, result)) @mock.patch("concurrent.futures.ThreadPoolExecutor.shutdown", side_effect=[lambda wait: None]) def test_cinex_002(self, _, __): @@ -190,7 +190,7 @@ def test_cinex_002(self, _, __): result = self.extractor_1._Extractor__cinex(self.inp_file, self.out_path, None) - self.assertCountEqual(expected, result, assertMsg(expected, result)) + self.assertCountEqual(expected, result, assert_msg(expected, result)) @mock.patch("concurrent.futures.ThreadPoolExecutor.shutdown", side_effect=[lambda wait: None]) def test_terminex_001(self, _, __): @@ -220,7 +220,7 @@ def test_terminex_001(self, _, __): result = self.extractor_1._Extractor__terminex(self.inp_file, 1) - self.assertCountEqual(expected, result, assertMsg(expected, result)) + self.assertCountEqual(expected, result, assert_msg(expected, result)) @mock.patch("concurrent.futures.ThreadPoolExecutor.shutdown", side_effect=[lambda wait: None]) def test_terminex_002(self, _, __): @@ -247,7 +247,7 @@ def test_terminex_002(self, _, __): result = self.extractor_1._Extractor__terminex(self.inp_file, None) - self.assertCountEqual(expected, result, assertMsg(expected, result)) + self.assertCountEqual(expected, result, assert_msg(expected, result)) @mock.patch("concurrent.futures.ThreadPoolExecutor.shutdown", side_effect=[lambda wait: None]) def test_outex_001(self, _, __): @@ -259,7 +259,7 @@ def test_outex_001(self, _, __): result = self.extractor_1._Extractor__outex(URL_1, self.out_file, 0) - self.assertEqual(expected, result, assertMsg(expected, result)) + self.assertEqual(expected, result, assert_msg(expected, result)) @mock.patch("concurrent.futures.ThreadPoolExecutor.shutdown", side_effect=[lambda wait: None]) def test_outex_002(self, _, __): @@ -270,7 +270,7 @@ def test_outex_002(self, _, __): result = self.extractor_1._Extractor__outex(URL_1, self.out_file, None) - self.assertEqual(expected, result, assertMsg(expected, result)) + self.assertEqual(expected, result, assert_msg(expected, result)) @mock.patch("concurrent.futures.ThreadPoolExecutor.shutdown", side_effect=[lambda wait: None]) def test_termex_001(self, _, __): @@ -282,7 +282,7 @@ def test_termex_001(self, _, __): result = self.extractor_1._Extractor__termex(URL_1, 1) - self.assertEqual(expected, result, assertMsg(expected, result)) + self.assertEqual(expected, result, assert_msg(expected, result)) @mock.patch("concurrent.futures.ThreadPoolExecutor.shutdown", side_effect=[lambda wait: None]) def test_termex_002(self, _, __): @@ -293,7 +293,7 @@ def test_termex_002(self, _, __): result = self.extractor_1._Extractor__termex(URL_1, None) - self.assertEqual(expected, result, assertMsg(expected, result)) + self.assertEqual(expected, result, assert_msg(expected, result)) @mock.patch("concurrent.futures.ThreadPoolExecutor.shutdown", side_effect=[lambda wait: None]) def test_extractor_001(self, _, __): @@ -319,7 
+319,7 @@ def test_extractor_001(self, _, __): result = self.extractor_1.extract() - self.assertEqual(expected, result, assertMsg(expected, result)) + self.assertEqual(expected, result, assert_msg(expected, result)) @mock.patch("concurrent.futures.ThreadPoolExecutor.shutdown", side_effect=[lambda wait: None]) def test_extractor_002(self, _, __): @@ -352,7 +352,7 @@ def test_extractor_002(self, _, __): result = extractor_2.extract() - self.assertEqual(expected, result, assertMsg(expected, result)) + self.assertEqual(expected, result, assert_msg(expected, result)) @mock.patch("concurrent.futures.ThreadPoolExecutor.shutdown", side_effect=[lambda wait: None]) def test_extractor_003(self, _, __): @@ -371,7 +371,7 @@ def test_extractor_003(self, _, __): result = extractor_3.extract() - self.assertEqual(expected, result, assertMsg(expected, result)) + self.assertEqual(expected, result, assert_msg(expected, result)) @mock.patch("concurrent.futures.ThreadPoolExecutor.shutdown", side_effect=[lambda wait: None]) def test_extractor_004(self, _, __): @@ -390,4 +390,4 @@ def test_extractor_004(self, _, __): result = extractor_4.extract() - self.assertEqual(expected, result, assertMsg(expected, result)) + self.assertEqual(expected, result, assert_msg(expected, result)) diff --git a/modules/visualization.py b/modules/visualization.py index 6014555..83e08fb 100644 --- a/modules/visualization.py +++ b/modules/visualization.py @@ -1,4 +1,3 @@ -import json import os import sys from collections import Counter @@ -21,15 +20,11 @@ class Visualization: logger: A logger object to log the output. """ - def __init__(self, json_file, out_path, logger): - self.json_file = json_file + def __init__(self, json_data, out_path, logger): + self.data = json_data self.logger = logger - self.out_path = folder(os.path.join(out_path, "visualization")) - with open(self.json_file, "r", encoding="UTF-8") as f: - self.data = json.load(f) - self.G = nx.DiGraph() self.G.add_nodes_from(self.data.keys()) for key, value in self.data.items(): diff --git a/requirements.txt b/requirements.txt index a08f041..51ceb7d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,10 @@ matplotlib>=3.6.1 networkx>=2.8.8 psutil>=5.9.2 requests>=2.25.1 +pysocks>=1.7.1 seaborn>=0.11.1 +scipy>=1.9.3 yara-python>=4.2.0 lxml>=4.9.1 +neo4j==5.5.0 +python-dotenv>=1.0.0 diff --git a/requirements_dev.txt b/requirements_dev.txt index f9b7cf5..430e399 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -1,3 +1,3 @@ -r requirements.txt -pysocks>=1.7.1 +pytest>=7.1.3 coverage>=7.0.5
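With `network_structure.json` gone, `Visualization` is now handed the dictionary that `Crawler.crawl()` returns from `DatabaseManager.get_network_structure()`. A rough usage sketch of that handoff with a made-up two-page graph (the plotting calls that follow in `main()` are not shown in this diff and are omitted here):

```python
from modules.helper import setup_custom_logger
from modules.visualization import Visualization

logger = setup_custom_logger(name="vizlog", filelog=False)

# Shape returned by Crawler.crawl() / DatabaseManager.get_network_structure():
# each key is a crawled WebPage URL, each value the list of URLs it POINTS_TO.
json_data = {
    "http://example.onion/": ["http://example.onion/about", "http://example.onion/contact"],
    "http://example.onion/about": ["http://example.onion/"],
}

obj = Visualization(json_data=json_data, out_path="output", logger=logger)
```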