This repository has been archived by the owner on Dec 17, 2021. It is now read-only.

Commit

Merge pull request #229 from 18F/gather-abc
Abstract base class approach for gatherers
tadhg-ohiggins authored Mar 31, 2018
2 parents 331d3fc + e0a9585 commit d227489
Showing 5 changed files with 149 additions and 124 deletions.
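
In brief, this commit replaces each gatherer module's module-level gather(suffixes, options, extra) function with a Gatherer class built on a shared abstract base (gatherers/gathererabc.py). A minimal sketch of the calling convention before and after, condensed from the diff below (variable names follow the diff):

    # Before: each gatherer module exposed a module-level function.
    gatherer = importlib.import_module("gatherers.%s" % source)
    for domain in gatherer.gather(suffixes, options, extra):
        ...

    # After: each module exposes a Gatherer class; state is passed to the
    # constructor, and gather() takes no arguments.
    gatherer_module = importlib.import_module("gatherers.%s" % source)
    gatherer = gatherer_module.Gatherer(suffixes, options, extra)
    for domain in gatherer.gather():
        ...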
9 changes: 6 additions & 3 deletions gather
@@ -54,21 +54,24 @@ def run(options=None, cache_dir="./cache", results_dir="./results"):
extra = {}

try:
gatherer = importlib.import_module("gatherers.%s" % source)
gatherer_module = importlib.import_module(
"gatherers.%s" % source)
gatherer = gatherer_module.Gatherer(suffixes, options, extra)
except ImportError:
# If it's not a registered module, allow it to be "hot registered"
# as long as the user gave us a flag with that name that can be
# used as the --url option to the URL module.
if options.get(source):
gatherer = importlib.import_module("gatherers.url")
gatherer_module = importlib.import_module("gatherers.url")
extra['name'] = source
gatherer = gatherer_module.Gatherer(suffixes, options, extra)
else:
exc_type, exc_value, exc_traceback = sys.exc_info()
logging.error("[%s] Gatherer not found, or had an error during loading.\n\tERROR: %s\n\t%s" % (source, exc_type, exc_value))
exit(1)

# Iterate over each hostname.
for domain in gatherer.gather(suffixes, options, extra):
for domain in gatherer.gather():

# Always apply the suffix filter to returned names.
if not suffix_pattern.search(domain):
157 changes: 79 additions & 78 deletions gatherers/censys.py
File mode changed: 100755 → 100644
@@ -1,13 +1,14 @@
import os
import json
import csv
import json
import logging
import os
from typing import List

from google.cloud import bigquery
from google.oauth2 import service_account
import google.api_core.exceptions

from gatherers.gathererabc import Gatherer
from utils import utils

# Options:
@@ -36,82 +37,82 @@
default_timeout = 60 * 60 * 10


def gather(suffixes, options, extra={}):

# Returns a parsed, processed Google service credentials object.
credentials = load_credentials()

if credentials is None:
logging.warn("No BigQuery credentials provided.")
logging.warn("Set BIGQUERY_CREDENTIALS or BIGQUERY_CREDENTIALS_PATH environment variables.")
exit(1)

# When using this form of instantiation, the client won't pull
# the project_id out of the creds, has to be set explicitly.
client = bigquery.Client(
project=credentials.project_id,
credentials=credentials
)

# Allow override of default timeout (in seconds).
timeout = int(options.get("timeout", default_timeout))

# Construct the query.
query = query_for(suffixes)
logging.debug("Censys query:\n%s\n" % query)

# Hardcode this for now:
cache_dir = "./cache"
# Plan to store in cache/censys/export.csv.
download_path = utils.cache_path("export", "censys", ext="csv",
cache_dir=cache_dir)

# Reuse of cached data can be turned on with --cache.
cache = options.get("cache", False)
if (cache is True) and os.path.exists(download_path):
logging.warn("Using cached download data.")

# But by default, fetch new data from the BigQuery API,
# and write it to the expected download location.
else:
logging.warn("Kicking off SQL query job.")

rows = None

# Actually execute the query.
try:
# Executes query and loads all results into memory.
query_job = client.query(query)
iterator = query_job.result(timeout=timeout)
rows = list(iterator)
except google.api_core.exceptions.Forbidden:
logging.warn("Access denied to Censys' BigQuery tables.")
except:
logging.warn(utils.format_last_exception())
logging.warn("Error talking to BigQuery, aborting.")

# At this point, the job is complete and we need to download
# the resulting CSV URL in results_url.
logging.warn("Caching results of SQL query.")

download_file = open(download_path, 'w', newline='')
download_writer = csv.writer(download_file)
download_writer.writerow(["Domain"]) # will be skipped on read

# Parse the rows and write them out as they were returned (dupes
# and all), to be de-duped by the central gathering script.
for row in rows:
domains = row['common_name'] + row['dns_names']
for domain in domains:
download_writer.writerow([domain])

# End CSV writing.
download_file.close()

# Whether we downloaded it fresh or not, read from the cached data.
for domain in utils.load_domains(download_path):
if domain:
yield domain
class Gatherer(Gatherer):

def gather(self):

# Returns a parsed, processed Google service credentials object.
credentials = load_credentials()

if credentials is None:
logging.warn("No BigQuery credentials provided.")
logging.warn("Set BIGQUERY_CREDENTIALS or BIGQUERY_CREDENTIALS_PATH environment variables.")
exit(1)

# When using this form of instantiation, the client won't pull
# the project_id out of the creds, has to be set explicitly.
client = bigquery.Client(
project=credentials.project_id,
credentials=credentials
)

# Allow override of default timeout (in seconds).
timeout = int(self.options.get("timeout", default_timeout))

# Construct the query.
query = query_for(self.suffixes)
logging.debug("Censys query:\n%s\n" % query)

# Plan to store in cache/censys/export.csv.
download_path = utils.cache_path(
"export", "censys", ext="csv", cache_dir=self.cache_dir)

# Reuse of cached data can be turned on with --cache.
cache = self.options.get("cache", False)
if (cache is True) and os.path.exists(download_path):
logging.warn("Using cached download data.")

# But by default, fetch new data from the BigQuery API,
# and write it to the expected download location.
else:
logging.warn("Kicking off SQL query job.")

rows = None

# Actually execute the query.
try:
# Executes query and loads all results into memory.
query_job = client.query(query)
iterator = query_job.result(timeout=timeout)
rows = list(iterator)
except google.api_core.exceptions.Forbidden:
logging.warn("Access denied to Censys' BigQuery tables.")
except:
logging.warn(utils.format_last_exception())
logging.warn("Error talking to BigQuery, aborting.")

# At this point, the job is complete and we need to download
# the resulting CSV URL in results_url.
logging.warn("Caching results of SQL query.")

download_file = open(download_path, 'w', newline='')
download_writer = csv.writer(download_file)
download_writer.writerow(["Domain"]) # will be skipped on read

# Parse the rows and write them out as they were returned (dupes
# and all), to be de-duped by the central gathering script.
for row in rows:
domains = row['common_name'] + row['dns_names']
for domain in domains:
download_writer.writerow([domain])

# End CSV writing.
download_file.close()

# Whether we downloaded it fresh or not, read from the cached data.
for domain in utils.load_domains(download_path):
if domain:
yield domain


# Constructs the query to run in BigQuery, against Censys'
18 changes: 18 additions & 0 deletions gatherers/gathererabc.py
@@ -0,0 +1,18 @@
from abc import ABCMeta, abstractmethod
import os
from typing import List


class Gatherer(metaclass=ABCMeta):

def __init__(self, suffixes: List[str], options: dict, extra: dict={}):
self.suffixes = suffixes
self.options = options
self.extra = extra
self.report_dir = self.options.get("output", "./")
self.cache_dir = os.path.join(self.report_dir, "cache")
self.results_dir = os.path.join(self.report_dir, "results")

@abstractmethod
def gather(self):
pass
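
For reference, a hypothetical new gatherer under this base class only needs to subclass Gatherer and implement gather(). The sketch below is a made-up "static list" gatherer, not part of this commit; it assumes the conventions visible in the other gatherer modules in this diff, including a hypothetical --domains option:

    # Hypothetical example only -- not part of this commit.
    # A new module such as gatherers/static.py could look like this:
    from gatherers.gathererabc import Gatherer


    class Gatherer(Gatherer):

        def gather(self):
            # self.suffixes, self.options, self.extra, self.cache_dir and
            # self.results_dir are populated by the base class __init__.
            for domain in self.options.get("domains", "").split(","):
                if domain:
                    yield domain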
30 changes: 17 additions & 13 deletions gatherers/rdns.py
@@ -3,6 +3,8 @@
import re
from typing import Generator, List, Pattern

from gatherers.gathererabc import Gatherer

# Reverse DNS
#
# Given a path to a (local) "JSON Lines" formatted file,
@@ -25,23 +27,25 @@
number_filter = re.compile("^[\d\-]+\.")


def gather(suffixes, options, extra={}):
path = options.get("rdns")
class Gatherer(Gatherer):

def gather(self):
path = self.options.get("rdns")

if path is None:
logging.warn("--rdns is required to be a path to a local file.")
exit(1)
if path is None:
logging.warn("--rdns is required to be a path to a local file.")
exit(1)

# May become useful to allow URLs in future.
if path.startswith("http:") or path.startswith("https:"):
logging.warn("--rdns is required to be a path to a local file.")
exit(1)
# May become useful to allow URLs in future.
if path.startswith("http:") or path.startswith("https:"):
logging.warn("--rdns is required to be a path to a local file.")
exit(1)

with open(path) as lines:
logging.debug("\tReading %s..." % path)
with open(path) as lines:
logging.debug("\tReading %s..." % path)

for record in process_lines(lines, ip_filter, number_filter):
yield record
for record in process_lines(lines, ip_filter, number_filter):
yield record


def process_lines(lines: List[str], ip_filter: Pattern,
59 changes: 29 additions & 30 deletions gatherers/url.py
@@ -1,40 +1,39 @@
import os
import requests
import logging

from utils import utils

# Gathers hostnames from a CSV at a given URL.
#
# --url: The URL to download. Can also be a local path.
# Will be parsed as a CSV.
import requests

from gatherers.gathererabc import Gatherer
from utils import utils

def gather(suffixes, options, extra={}, cache_dir="./cache"):
# Defaults to --url, but can be overridden.
name = extra.get("name", "url")
url = options.get(name)

if url is None:
logging.warn("A --url is required. (Can be a local path.)")
exit(1)
class Gatherer(Gatherer):

# remote URL
if url.startswith("http:") or url.startswith("https:"):
# Though it's saved in cache/, it will be downloaded every time.
remote_path = os.path.join(cache_dir, "url.csv")
def gather(self):
# Defaults to --url, but can be overridden.
name = self.extra.get("name", "url")
url = self.options.get(name)

try:
response = requests.get(url)
utils.write(response.text, remote_path)
except:
logging.error("Remote URL not downloaded successfully.")
print(utils.format_last_exception())
if url is None:
logging.warn("A --url is required. (Can be a local path.)")
exit(1)

# local path
else:
remote_path = url

for domain in utils.load_domains(remote_path):
yield domain
# remote URL
if url.startswith("http:") or url.startswith("https:"):
# Though it's saved in cache/, it will be downloaded every time.
remote_path = os.path.join(self.cache_dir, "url.csv")

try:
response = requests.get(url)
utils.write(response.text, remote_path)
except:
logging.error("Remote URL not downloaded successfully.")
print(utils.format_last_exception())
exit(1)

# local path
else:
remote_path = url

for domain in utils.load_domains(remote_path):
yield domain
