This repository has been archived by the owner on Dec 17, 2021. It is now read-only.

Commit

Merge pull request #229 from 18F/gather-abc
Abstract base class approach for gatherers
tadhg-ohiggins authored Mar 31, 2018
2 parents 331d3fc + e0a9585 commit d227489
Showing 5 changed files with 149 additions and 124 deletions.
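
In brief, this commit replaces each gatherer module's module-level gather(suffixes, options, extra) function with a Gatherer class built on a shared abstract base (gatherers/gathererabc.py). A minimal sketch of the calling convention before and after, condensed from the diff below (variable names follow the diff):

    # Before: each gatherer module exposed a module-level function.
    gatherer = importlib.import_module("gatherers.%s" % source)
    for domain in gatherer.gather(suffixes, options, extra):
        ...

    # After: each module exposes a Gatherer class; state is passed to the
    # constructor, and gather() takes no arguments.
    gatherer_module = importlib.import_module("gatherers.%s" % source)
    gatherer = gatherer_module.Gatherer(suffixes, options, extra)
    for domain in gatherer.gather():
        ...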
9 changes: 6 additions & 3 deletions gather
@@ -54,21 +54,24 @@ def run(options=None, cache_dir="./cache", results_dir="./results"):
extra = {}

try:
gatherer = importlib.import_module("gatherers.%s" % source)
gatherer_module = importlib.import_module(
"gatherers.%s" % source)
gatherer = gatherer_module.Gatherer(suffixes, options, extra)
except ImportError:
# If it's not a registered module, allow it to be "hot registered"
# as long as the user gave us a flag with that name that can be
# used as the --url option to the URL module.
if options.get(source):
gatherer = importlib.import_module("gatherers.url")
gatherer_module = importlib.import_module("gatherers.url")
extra['name'] = source
gatherer = gatherer_module.Gatherer(suffixes, options, extra)
else:
exc_type, exc_value, exc_traceback = sys.exc_info()
logging.error("[%s] Gatherer not found, or had an error during loading.\n\tERROR: %s\n\t%s" % (source, exc_type, exc_value))
exit(1)

# Iterate over each hostname.
for domain in gatherer.gather(suffixes, options, extra):
for domain in gatherer.gather():

# Always apply the suffix filter to returned names.
if not suffix_pattern.search(domain):
157 changes: 79 additions & 78 deletions gatherers/censys.py
File mode changed: 100755 → 100644
@@ -1,13 +1,14 @@
import os
import json
import csv
import json
import logging
import os
from typing import List

from google.cloud import bigquery
from google.oauth2 import service_account
import google.api_core.exceptions

from gatherers.gathererabc import Gatherer
from utils import utils

# Options:
@@ -36,82 +37,82 @@
default_timeout = 60 * 60 * 10


def gather(suffixes, options, extra={}):

# Returns a parsed, processed Google service credentials object.
credentials = load_credentials()

if credentials is None:
logging.warn("No BigQuery credentials provided.")
logging.warn("Set BIGQUERY_CREDENTIALS or BIGQUERY_CREDENTIALS_PATH environment variables.")
exit(1)

# When using this form of instantiation, the client won't pull
# the project_id out of the creds, has to be set explicitly.
client = bigquery.Client(
project=credentials.project_id,
credentials=credentials
)

# Allow override of default timeout (in seconds).
timeout = int(options.get("timeout", default_timeout))

# Construct the query.
query = query_for(suffixes)
logging.debug("Censys query:\n%s\n" % query)

# Hardcode this for now:
cache_dir = "./cache"
# Plan to store in cache/censys/export.csv.
download_path = utils.cache_path("export", "censys", ext="csv",
cache_dir=cache_dir)

# Reuse of cached data can be turned on with --cache.
cache = options.get("cache", False)
if (cache is True) and os.path.exists(download_path):
logging.warn("Using cached download data.")

# But by default, fetch new data from the BigQuery API,
# and write it to the expected download location.
else:
logging.warn("Kicking off SQL query job.")

rows = None

# Actually execute the query.
try:
# Executes query and loads all results into memory.
query_job = client.query(query)
iterator = query_job.result(timeout=timeout)
rows = list(iterator)
except google.api_core.exceptions.Forbidden:
logging.warn("Access denied to Censys' BigQuery tables.")
except:
logging.warn(utils.format_last_exception())
logging.warn("Error talking to BigQuery, aborting.")

# At this point, the job is complete and we need to download
# the resulting CSV URL in results_url.
logging.warn("Caching results of SQL query.")

download_file = open(download_path, 'w', newline='')
download_writer = csv.writer(download_file)
download_writer.writerow(["Domain"]) # will be skipped on read

# Parse the rows and write them out as they were returned (dupes
# and all), to be de-duped by the central gathering script.
for row in rows:
domains = row['common_name'] + row['dns_names']
for domain in domains:
download_writer.writerow([domain])

# End CSV writing.
download_file.close()

# Whether we downloaded it fresh or not, read from the cached data.
for domain in utils.load_domains(download_path):
if domain:
yield domain
class Gatherer(Gatherer):

def gather(self):

# Returns a parsed, processed Google service credentials object.
credentials = load_credentials()

if credentials is None:
logging.warn("No BigQuery credentials provided.")
logging.warn("Set BIGQUERY_CREDENTIALS or BIGQUERY_CREDENTIALS_PATH environment variables.")
exit(1)

# When using this form of instantiation, the client won't pull
# the project_id out of the creds, has to be set explicitly.
client = bigquery.Client(
project=credentials.project_id,
credentials=credentials
)

# Allow override of default timeout (in seconds).
timeout = int(self.options.get("timeout", default_timeout))

# Construct the query.
query = query_for(self.suffixes)
logging.debug("Censys query:\n%s\n" % query)

# Plan to store in cache/censys/export.csv.
download_path = utils.cache_path(
"export", "censys", ext="csv", cache_dir=self.cache_dir)

# Reuse of cached data can be turned on with --cache.
cache = self.options.get("cache", False)
if (cache is True) and os.path.exists(download_path):
logging.warn("Using cached download data.")

# But by default, fetch new data from the BigQuery API,
# and write it to the expected download location.
else:
logging.warn("Kicking off SQL query job.")

rows = None

# Actually execute the query.
try:
# Executes query and loads all results into memory.
query_job = client.query(query)
iterator = query_job.result(timeout=timeout)
rows = list(iterator)
except google.api_core.exceptions.Forbidden:
logging.warn("Access denied to Censys' BigQuery tables.")
except:
logging.warn(utils.format_last_exception())
logging.warn("Error talking to BigQuery, aborting.")

# At this point, the job is complete and we need to download
# the resulting CSV URL in results_url.
logging.warn("Caching results of SQL query.")

download_file = open(download_path, 'w', newline='')
download_writer = csv.writer(download_file)
download_writer.writerow(["Domain"]) # will be skipped on read

# Parse the rows and write them out as they were returned (dupes
# and all), to be de-duped by the central gathering script.
for row in rows:
domains = row['common_name'] + row['dns_names']
for domain in domains:
download_writer.writerow([domain])

# End CSV writing.
download_file.close()

# Whether we downloaded it fresh or not, read from the cached data.
for domain in utils.load_domains(download_path):
if domain:
yield domain


# Constructs the query to run in BigQuery, against Censys'
18 changes: 18 additions & 0 deletions gatherers/gathererabc.py
@@ -0,0 +1,18 @@
from abc import ABCMeta, abstractmethod
import os
from typing import List


class Gatherer(metaclass=ABCMeta):

def __init__(self, suffixes: List[str], options: dict, extra: dict={}):
self.suffixes = suffixes
self.options = options
self.extra = extra
self.report_dir = self.options.get("output", "./")
self.cache_dir = os.path.join(self.report_dir, "cache")
self.results_dir = os.path.join(self.report_dir, "results")

@abstractmethod
def gather(self):
pass
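
For reference, a hypothetical new gatherer under this base class only needs to subclass Gatherer and implement gather(). The sketch below is a made-up "static list" gatherer, not part of this commit; it assumes the conventions visible in the other gatherer modules in this diff, including a hypothetical --domains option:

    # Hypothetical example only -- not part of this commit.
    # A new module such as gatherers/static.py could look like this:
    from gatherers.gathererabc import Gatherer


    class Gatherer(Gatherer):

        def gather(self):
            # self.suffixes, self.options, self.extra, self.cache_dir and
            # self.results_dir are populated by the base class __init__.
            for domain in self.options.get("domains", "").split(","):
                if domain:
                    yield domain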
30 changes: 17 additions & 13 deletions gatherers/rdns.py
@@ -3,6 +3,8 @@
import re
from typing import Generator, List, Pattern

from gatherers.gathererabc import Gatherer

# Reverse DNS
#
# Given a path to a (local) "JSON Lines" formatted file,
@@ -25,23 +27,25 @@
number_filter = re.compile("^[\d\-]+\.")


def gather(suffixes, options, extra={}):
path = options.get("rdns")
class Gatherer(Gatherer):

def gather(self):
path = self.options.get("rdns")

if path is None:
logging.warn("--rdns is required to be a path to a local file.")
exit(1)
if path is None:
logging.warn("--rdns is required to be a path to a local file.")
exit(1)

# May become useful to allow URLs in future.
if path.startswith("http:") or path.startswith("https:"):
logging.warn("--rdns is required to be a path to a local file.")
exit(1)
# May become useful to allow URLs in future.
if path.startswith("http:") or path.startswith("https:"):
logging.warn("--rdns is required to be a path to a local file.")
exit(1)

with open(path) as lines:
logging.debug("\tReading %s..." % path)
with open(path) as lines:
logging.debug("\tReading %s..." % path)

for record in process_lines(lines, ip_filter, number_filter):
yield record
for record in process_lines(lines, ip_filter, number_filter):
yield record


def process_lines(lines: List[str], ip_filter: Pattern,
59 changes: 29 additions & 30 deletions gatherers/url.py
@@ -1,40 +1,39 @@
import os
import requests
import logging

from utils import utils

# Gathers hostnames from a CSV at a given URL.
#
# --url: The URL to download. Can also be a local path.
# Will be parsed as a CSV.
import requests

from gatherers.gathererabc import Gatherer
from utils import utils

def gather(suffixes, options, extra={}, cache_dir="./cache"):
# Defaults to --url, but can be overridden.
name = extra.get("name", "url")
url = options.get(name)

if url is None:
logging.warn("A --url is required. (Can be a local path.)")
exit(1)
class Gatherer(Gatherer):

# remote URL
if url.startswith("http:") or url.startswith("https:"):
# Though it's saved in cache/, it will be downloaded every time.
remote_path = os.path.join(cache_dir, "url.csv")
def gather(self):
# Defaults to --url, but can be overridden.
name = self.extra.get("name", "url")
url = self.options.get(name)

try:
response = requests.get(url)
utils.write(response.text, remote_path)
except:
logging.error("Remote URL not downloaded successfully.")
print(utils.format_last_exception())
if url is None:
logging.warn("A --url is required. (Can be a local path.)")
exit(1)

# local path
else:
remote_path = url

for domain in utils.load_domains(remote_path):
yield domain
# remote URL
if url.startswith("http:") or url.startswith("https:"):
# Though it's saved in cache/, it will be downloaded every time.
remote_path = os.path.join(self.cache_dir, "url.csv")

try:
response = requests.get(url)
utils.write(response.text, remote_path)
except:
logging.error("Remote URL not downloaded successfully.")
print(utils.format_last_exception())
exit(1)

# local path
else:
remote_path = url

for domain in utils.load_domains(remote_path):
yield domain
