refact(noisy): handle UnicodeDecodeError, add Makefile #26

Open · wants to merge 2 commits into master
2 changes: 1 addition & 1 deletion Dockerfile
@@ -1,4 +1,4 @@
FROM python:2.7-alpine
FROM jfloff/alpine-python:2.7-slim
WORKDIR /
COPY requirements.txt .
RUN pip install -r requirements.txt
17 changes: 17 additions & 0 deletions Makefile
@@ -0,0 +1,17 @@
.PHONY: run multi-run install-systemd-service help

run:
python noisy.py --config config.json

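# usage: make scale=<num of containers> multi-run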
multi-run:
bash -c "trap 'docker-compose down' EXIT; cd examples/docker-compose && docker-compose up --build --scale noisy=$(scale)"

install-systemd-service:
sudo cp examples/systemd/noisy.service /etc/systemd/system
sudo systemctl daemon-reload
sudo systemctl enable noisy && sudo systemctl start noisy
echo "you can check the output with 'journalctl -f -u noisy'"

help:
python noisy.py --help

10 changes: 9 additions & 1 deletion README.md
@@ -1,10 +1,12 @@

# Noisy
[![CircleCI](https://circleci.com/gh/1tayH/noisy/tree/master.svg?style=shield)](https://circleci.com/gh/1tayH/noisy/tree/master)
<a href="https://github.com/ambv/black"><img alt="Code style: black" src="https://img.shields.io/badge/code%20style-black-000000.svg"></a>


A simple python script that generates random HTTP/DNS traffic noise in the background while you go about your regular web browsing, to make your web traffic data less valuable for selling and for extra obscurity.

Tested on MacOS High Sierra, Ubuntu 16.04 and Raspbian Stretch and is compatable with both Python 2.7 and 3.6
Tested on MacOS High Sierra, Ubuntu 16.04 and Raspbian Stretch and is compatible with both Python 2.7 and 3.6

## Getting Started

@@ -34,6 +36,10 @@ Run the script

```
python noisy.py --config config.json

# or

make run
```

The program can accept a number of command line arguments:
@@ -90,6 +96,8 @@ DEBUG:urllib3.connectionpool:https://www.reddit.com:443 "GET /user/Saditon HTTP/

`docker run -it noisy --config config.json`

To simplify starting several noisy containers at once, you can run `make scale=[num of containers] multi-run`.

## Some examples

Some edge-case examples are available in the `examples` folder. You can read more in [examples/README.md](examples/README.md).
10 changes: 10 additions & 0 deletions examples/README.md
@@ -27,3 +27,13 @@ You can view the script's output by running:
```
$ journalctl -f -u noisy
```

## Bulk update root URLs

You can further tweak the noise generation by adding more root URLs to visit, e.g. the Alexa top 1 million sites.
An example bash script that does this is shown below; you can run it from the project folder:

```bash
curl -sSL https://gist.githubusercontent.com/szepnapot/6fffd93688556f97e4a8f79837a1c0ca/raw/b7853a7038d1967dbe8fa94ff85ef5624a8d27d1/update_root_urls.sh | bash
```
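
If you prefer to stay in Python rather than pipe a gist into bash, the snippet below sketches the same bulk update. It is an illustrative sketch only: the `top-sites.csv` file name, the `rank,domain` CSV layout and the 1000-site cut-off are assumptions made here, and `root_urls` is the list of starting pages the crawler reads from `config.json`.

```python
import csv
import json

# Read an Alexa-style "rank,domain" CSV and turn each domain into a root URL.
# The file name and the 1000-site cut-off are arbitrary choices for this sketch.
with open("top-sites.csv") as csv_file:
    extra_urls = ["http://{}".format(row[1]) for row in csv.reader(csv_file)][:1000]

# Append the new URLs to the crawler's config, skipping duplicates.
with open("config.json") as config_file:
    config = json.load(config_file)

for url in extra_urls:
    if url not in config["root_urls"]:
        config["root_urls"].append(url)

with open("config.json", "w") as config_file:
    json.dump(config, config_file, indent=4)
```

Either way, only `config.json` changes; the crawler itself needs no modification.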
99 changes: 68 additions & 31 deletions noisy.py
@@ -10,15 +10,15 @@
import requests
from urllib3.exceptions import LocationParseError

try: # Python 3
try:  # Python 3
from urllib.parse import urljoin, urlparse
except ImportError: # Python 2
from urlparse import urljoin, urlparse

try: # Python 2
try: # Python 2
reload(sys)
sys.setdefaultencoding('latin-1')
except NameError: # Python 3
sys.setdefaultencoding("latin-1")
except NameError: # Python 3
pass


@@ -35,6 +35,7 @@ class CrawlerTimedOut(Exception):
"""
Raised when the specified timeout is exceeded
"""

pass

def _request(self, url):
@@ -44,10 +45,9 @@ def _request(self, url):
:return: the response Requests object
"""
random_user_agent = random.choice(self._config["user_agents"])
headers = {'user-agent': random_user_agent}
headers = {"user-agent": random_user_agent}

response = requests.get(url, headers=headers, timeout=5)

return response

@staticmethod
@@ -70,7 +70,9 @@ def _normalize_link(link, root_url):

# '//' means keep the current protocol used to access this URL
if link.startswith("//"):
return "{}://{}{}".format(parsed_root_url.scheme, parsed_url.netloc, parsed_url.path)
return "{}://{}{}".format(
parsed_root_url.scheme, parsed_url.netloc, parsed_url.path
)

# possibly a relative path
if not parsed_url.scheme:
@@ -89,11 +91,13 @@ def _is_valid_url(url):
:return: boolean indicating whether the URL is valid or not
"""
regex = re.compile(
r'^(?:http|ftp)s?://' # http:// or https://
r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' # domain...
r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
r'(?::\d+)?' # optional port
r'(?:/?|[/?]\S+)$', re.IGNORECASE)
r"^(?:http|ftp)s?://" # http:// or https://
r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|" # domain...
r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})" # ...or ip
r"(?::\d+)?" # optional port
r"(?:/?|[/?]\S+)$",
re.IGNORECASE,
)
return re.match(regex, url) is not None

def _is_blacklisted(self, url):
@@ -102,7 +106,10 @@ def _is_blacklisted(self, url):
:param url: full URL
:return: boolean indicating whether a URL is blacklisted or not
"""
return any(blacklisted_url in url for blacklisted_url in self._config["blacklisted_urls"])
return any(
blacklisted_url in url
for blacklisted_url in self._config["blacklisted_urls"]
)

def _should_accept_url(self, url):
"""
@@ -120,7 +127,9 @@ def _extract_urls(self, body, root_url):
:param root_url: the root URL of the given body
:return: list of extracted links
"""
pattern = r"href=[\"'](?!#)(.*?)[\"'].*?" # ignore links starting with #, no point in re-visiting the same page
pattern = (
r"href=[\"'](?!#)(.*?)[\"'].*?"
) # ignore links starting with #, no point in re-visiting the same page
urls = re.findall(pattern, str(body))

normalize_urls = [self._normalize_link(url, root_url) for url in urls]
@@ -134,7 +143,9 @@ def _remove_and_blacklist(self, link):
and blacklists it so we don't visit it in the future
:param link: link to remove and blacklist
"""
self._config['blacklisted_urls'].append(link)
if link not in self._links:
return
self._config["blacklisted_urls"].append(link)
del self._links[self._links.index(link)]

def _browse_from_links(self, depth=0):
@@ -145,7 +156,7 @@ def _browse_from_links(self, depth=0):
a dead end has been reached or when we run out of links
:param depth: our current link depth
"""
is_depth_reached = depth >= self._config['max_depth']
is_depth_reached = depth >= self._config["max_depth"]
if not len(self._links) or is_depth_reached:
logging.debug("Hit a dead end, moving to the next root URL")
# escape from the recursion, we don't have links to continue or we have reached the max depth
@@ -161,7 +172,9 @@ def _browse_from_links(self, depth=0):
sub_links = self._extract_urls(sub_page, random_link)

# sleep for a random amount of time
time.sleep(random.randrange(self._config["min_sleep"], self._config["max_sleep"]))
time.sleep(
random.randrange(self._config["min_sleep"], self._config["max_sleep"])
)

# make sure we have more than 1 link to pick from
if len(sub_links) > 1:
@@ -173,7 +186,10 @@ def _browse_from_links(self, depth=0):
self._remove_and_blacklist(random_link)

except requests.exceptions.RequestException:
logging.debug("Exception on URL: %s, removing from list and trying again!" % random_link)
logging.debug(
"Exception on URL: %s, removing from list and trying again!"
% random_link
)
self._remove_and_blacklist(random_link)

self._browse_from_links(depth + 1)
@@ -185,7 +201,7 @@ def load_config_file(self, file_path):
:param file_path: path of the config file
:return:
"""
with open(file_path, 'r') as config_file:
with open(file_path, "r") as config_file:
config = json.load(config_file)
self.set_config(config)

@@ -216,8 +232,12 @@ def _is_timeout_reached(self):
is specified then return false
:return: boolean indicating whether the timeout has been reached
"""
is_timeout_set = self._config["timeout"] is not False # False is set when no timeout is desired
end_time = self._start_time + datetime.timedelta(seconds=self._config["timeout"])
is_timeout_set = (
self._config["timeout"] is not False
) # False is set when no timeout is desired
end_time = self._start_time + datetime.timedelta(
seconds=self._config["timeout"]
)
is_timed_out = datetime.datetime.now() >= end_time

return is_timeout_set and is_timed_out
@@ -237,25 +257,42 @@ def crawl(self):
logging.debug("found {} links".format(len(self._links)))
self._browse_from_links()

except UnicodeDecodeError:
logging.warning("Error decoding root url: {}".format(url))
self._remove_and_blacklist(url)

except requests.exceptions.RequestException:
logging.warn("Error connecting to root url: {}".format(url))
logging.warning("Error connecting to root url: {}".format(url))

except MemoryError:
logging.warn("Error: content at url: {} is exhausting the memory".format(url))
logging.warning(
"Error: content at url: {} is exhausting the memory".format(url)
)

except LocationParseError:
logging.warn("Error encountered during parsing of: {}".format(url))
logging.warning("Error encountered during parsing of: {}".format(url))

except self.CrawlerTimedOut:
logging.info("Timeout has exceeded, exiting")
return


def main():
parser = argparse.ArgumentParser()
parser.add_argument('--log', metavar='-l', type=str, help='logging level', default='info')
parser.add_argument('--config', metavar='-c', required=True, type=str, help='config file')
parser.add_argument('--timeout', metavar='-t', required=False, type=int,
help='for how long the crawler should be running, in seconds', default=False)
parser.add_argument(
"--log", metavar="-l", type=str, help="logging level", default="info"
)
parser.add_argument(
"--config", metavar="-c", required=True, type=str, help="config file"
)
parser.add_argument(
"--timeout",
metavar="-t",
required=False,
type=int,
help="for how long the crawler should be running, in seconds",
default=False,
)
args = parser.parse_args()

level = getattr(logging, args.log.upper())
@@ -265,10 +302,10 @@ def main():
crawler.load_config_file(args.config)

if args.timeout:
crawler.set_option('timeout', args.timeout)
crawler.set_option("timeout", args.timeout)

crawler.crawl()


if __name__ == '__main__':
if __name__ == "__main__":
main()
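
For readers skimming the interleaved diff above, the snippet below isolates the behaviour this PR adds to `crawl()`. It is a self-contained sketch, not code from the PR: `fetch_root` and the plain-list bookkeeping are stand-ins for `Crawler._request`, `_extract_urls` and `_remove_and_blacklist`, but the exception handling mirrors the new `except UnicodeDecodeError` branch.

```python
import logging

import requests
from urllib3.exceptions import LocationParseError


def fetch_root(url):
    # Stand-in for the crawler's per-root-URL work: fetch the page and force a
    # decode so that badly encoded content raises UnicodeDecodeError.
    response = requests.get(url, timeout=5)
    return response.content.decode("utf-8")


def visit_root_urls(root_urls, blacklisted_urls):
    for url in list(root_urls):
        try:
            fetch_root(url)

        except UnicodeDecodeError:
            # The behaviour added by this PR: a root URL whose body cannot be
            # decoded is logged, removed and blacklisted instead of taking the
            # whole crawl down.
            logging.warning("Error decoding root url: {}".format(url))
            root_urls.remove(url)
            blacklisted_urls.append(url)

        except (requests.exceptions.RequestException, LocationParseError):
            # Connection and parsing problems were already tolerated before this
            # PR; the diff only upgrades logging.warn to logging.warning there.
            logging.warning("Error connecting to root url: {}".format(url))
```

Calling `visit_root_urls(["http://example.com"], [])`, for instance, leaves both lists untouched, while a root URL that serves undecodable bytes would migrate from the first list to the second.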