Skip to content

Commit

Permalink
Handle the case of many unknown CDP values (NotEnoughData exception).
Browse files Browse the repository at this point in the history
Also, added a checker based on the `curl` collectd plugin.
  • Loading branch information
drmalex07 committed Dec 9, 2015
1 parent 60ab8ba commit b8b9cc1
Show file tree
Hide file tree
Showing 22 changed files with 433 additions and 169 deletions.
13 changes: 9 additions & 4 deletions alerts/config-example.ini
Original file line number Diff line number Diff line change
Expand Up @@ -8,20 +8,20 @@ foo = alerts.lib.checkers.foo:Checker

[alerts]

check = cpu memory df
check = cpu memory df curl

# specify in jiffies
cpu.usage_level = 85
cpu.interval = 1800
cpu.interval = 1200
cpu.resolution = 120

# specify as percentage
memory.usage_level = 95
memory.interval = 300
memory.interval = 360
memory.resolution = 60

# specify as percentage
df.usage_level = 95
df.usage_level = 90
df.interval = 1200
df.resolution = 600

Expand All @@ -31,6 +31,11 @@ nginx.active_connections = 100
foo.bar = baz
foo.usage_level = 0.9

# specify as seconds
curl.max_response_time = 5
curl.interval = 120
curl.resolution = 10

[mailer]

smtp_host = mail.localdomain
Expand Down
1 change: 1 addition & 0 deletions alerts/lib/checkers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,3 +73,4 @@ def check(self, hostname):
from . import cpu
from . import memory
from . import df
from . import curl
31 changes: 26 additions & 5 deletions alerts/lib/checkers/cpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from alerts import template_loader
from alerts.lib import Message
from alerts.lib.collected_stats import Stats as BaseStats
from alerts.lib.collected_stats import NoData, NotEnoughData
from alerts.lib.checkers import BaseChecker, named_checker

class Stats(BaseStats):
Expand All @@ -24,23 +25,41 @@ def __init__(self):
self.resolution = None
return

## IChecker interface ##

def setup(self, collection_dir, logger, opts):

BaseChecker.setup(self, collection_dir, logger, opts)
self.max_level = int(opts.get('usage_level', 85)) # jiffies
self.start = '-%ds' % (int(opts.get('interval', 1800)))
self.resolution = '%d' % (int(opts.get('resolution', 60)))
return

def check(self, hostname):

log1 = self.get_logger(hostname)
data_dir = self.data_dir(hostname)

max_u = self.max_level

n = self._find_number_of_cpus(data_dir)
n = self.find_number_of_cpus(data_dir)
for i in range(0, n):
u = self.get_usage(data_dir, i, 'user')
log1.debug('Computed usage for CPU #%d: %.2f', i, u)
try:
u = self.get_usage(data_dir, i, 'user')
except NotEnoughData as ex:
tpl = template_loader.load('not-enough-data.html')
msg_body = tpl.generate(
hostname = hostname,
exc_message = str(ex),
generated_at = datetime.datetime.now())
msg = Message(
title = u'Not enough data for processor usage at %s' % (hostname),
summary = u'Not enough data for CPU #%d: Skipping' % (i),
body = msg_body.render('html'))
log1.warn(msg)
continue # skip to next processor
else:
log1.debug('Computed usage for CPU #%d: %.2f', i, u)
if u > max_u:
tpl = template_loader.load('cpu.excessive-usage.html')
msg_body = tpl.generate(
Expand All @@ -61,14 +80,16 @@ def check(self, hostname):
body = None)
log1.info(msg)
return


## Helpers ##

def get_usage(self, data_dir, cpu_number, state='user'):
rrd_file = os.path.join(data_dir, 'cpu-%d/cpu-%s.rrd' % (cpu_number, state))
stats = Stats(rrd_file)
return stats.avg('value', self.start, self.resolution)

@staticmethod
def _find_number_of_cpus(data_dir):
def find_number_of_cpus(data_dir):
max_i = -1
for f in os.listdir(data_dir):
m = re.match('^cpu-([0-9][0-9]?)$', f)
Expand Down
106 changes: 106 additions & 0 deletions alerts/lib/checkers/curl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
import os
import re
import datetime
import zope.interface
from thrush import rrd

from alerts import template_loader
from alerts.lib import Message
from alerts.lib.collected_stats import Stats as BaseStats
from alerts.lib.collected_stats import NoData, NotEnoughData
from alerts.lib.checkers import BaseChecker, named_checker

class Stats(BaseStats):
    """RRD-backed statistics reader for curl response-time data.

    Specializes BaseStats for this checker; per the surrounding commit,
    averaging may raise NotEnoughData when too many of the consolidated
    data points (CDPs) in the queried window are unknown.
    """

    # Maximum tolerated fraction (0.4 = 40%) of unknown CDPs in the
    # queried window — presumably consumed by BaseStats.avg() to decide
    # when to raise NotEnoughData; TODO confirm against
    # alerts.lib.collected_stats, which is not visible here.
    UNKNOWN_CDP_RATIO = 0.4

class RRD(rrd.RRD):
    """Schema for the RRD files written by the collectd curl plugin:
    a single gauge data source named 'value' (the response time).
    """

    # heartbeat=20 — NOTE(review): presumably matches the collectd
    # collection interval for this plugin; confirm against the
    # collectd configuration.
    value = rrd.Gauge(heartbeat=20)

@named_checker('curl')
class Checker(BaseChecker):
    """Alert checker for page response times gathered by the collectd
    curl plugin.

    For every `curl-<page>` directory under a host's data directory,
    the average response time over the configured interval is compared
    against a maximum; pages whose data is mostly unknown (NotEnoughData)
    are reported as non-responsive and skipped.
    """

    def __init__(self):
        BaseChecker.__init__(self)
        # All thresholds/windows are provided later, via setup().
        self.max_response_time = None
        self.resolution = None
        self.start = None
        return

    ## IChecker interface ##

    def setup(self, collection_dir, logger, opts):
        """Configure the checker from the `[alerts]` options.

        Options (all optional): `max_response_time` in seconds (default 8,
        stored internally in milliseconds), `interval` in seconds
        (default 300), `resolution` in seconds (default 40).
        """
        BaseChecker.setup(self, collection_dir, logger, opts)
        # Config value is given in seconds; keep the threshold in ms.
        self.max_response_time = 1e3 * float(opts.get('max_response_time', 8)) # milliseconds
        self.start = '-%ds' % (int(opts.get('interval', 300)))
        self.resolution = '%d' % (int(opts.get('resolution', 40)))
        return

    def check(self, hostname):
        """Check every curl-monitored page of *hostname* and emit a
        warning/info Message per page."""
        log = self.get_logger(hostname)
        data_dir = self.data_dir(hostname)

        limit = self.max_response_time

        for page_name in self.find_page_names(data_dir):
            try:
                # Stats are in seconds; convert to milliseconds.
                elapsed = 1e3 * self.get_response_time(data_dir, page_name)
            except NotEnoughData as ex:
                # Mostly-unknown CDPs: treat the page as non-responsive
                # and move on to the next one.
                body = template_loader.load('curl.non-responsive-page.html').generate(
                    hostname = hostname,
                    page_name = page_name,
                    exc_message = str(ex),
                    generated_at = datetime.datetime.now())
                log.warn(Message(
                    title = u'The page "%s" is not responsive' % (page_name),
                    summary = u'The page "%s" is not responsive: Skipping' % (page_name),
                    body = body.render('html')))
                continue
            log.debug('Computed response time for page "%s": %.1fms', page_name, elapsed)

            if elapsed > limit:
                body = template_loader.load('curl.sluggish-page.html').generate(
                    hostname = hostname,
                    page_name = page_name,
                    max_response_time = '%.1fms' % (limit),
                    avg_response_time = '%.1fms' % (elapsed),
                    generated_at = datetime.datetime.now())
                log.warn(Message(
                    title = u'The page "%s" takes too long to respond' % (page_name),
                    summary = u'Check if page "%s" responsive: FAILED (%.1fms > %.1fms)' % (
                        page_name, elapsed, limit),
                    body = body.render('html')))
            else:
                log.info(Message(
                    title = u'Check if page "%s" responsive' %(page_name),
                    summary = u'Check if page "%s" responsive: OK (%.1fms < %.1fms)' % (
                        page_name, elapsed, limit),
                    body = None))

        return

    ## Helpers ##

    def get_response_time(self, data_dir, page_name):
        """Average response time (in seconds) of a page over the
        configured start/resolution window."""
        stats = Stats(os.path.join(data_dir, 'curl-%s/response_time.rrd' % (page_name)))
        return stats.avg('value', self.start, self.resolution)

    @staticmethod
    def find_page_names(data_dir):
        """Return the page names, i.e. the `<name>` part of every
        `curl-<name>` entry directly under *data_dir*."""
        matches = (re.match('^curl-(.*)$', entry) for entry in os.listdir(data_dir))
        return [m.group(1) for m in matches if m]

60 changes: 48 additions & 12 deletions alerts/lib/checkers/df.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from alerts import template_loader
from alerts.lib import Message
from alerts.lib.collected_stats import Stats as BaseStats
from alerts.lib.collected_stats import NoData, NotEnoughData
from alerts.lib.checkers import BaseChecker, named_checker

class Stats(BaseStats):
Expand All @@ -33,23 +34,42 @@ def __init__(self):
self.resolution = None
return

## IChecker interface ##

def setup(self, collection_dir, logger, opts):

BaseChecker.setup(self, collection_dir, logger, opts)
self.max_level = int(opts.get('usage_level', 90)) #
self.start = '-%ds' % (int(opts.get('interval', 1200)))
self.resolution = '%d' % (int(opts.get('resolution', 600)))
return

def check(self, hostname):

log1 = self.get_logger(hostname)
data_dir = self.data_dir(hostname)

max_u = self.max_level

fs_names = self._find_fs_names(data_dir)
fs_names = self.find_fs_names(data_dir)

# Check space

for name in fs_names:
# Check space
uv = self.get_usage_of_space(data_dir, name)
try:
uv = self.get_usage_of_space(data_dir, name)
except NotEnoughData as ex:
tpl = template_loader.load('not-enough-data.html')
msg_body = tpl.generate(
hostname = hostname,
exc_message = str(ex),
generated_at = datetime.datetime.now())
msg = Message(
title = u'Not enough data for filesystem usage at %s' % (hostname),
summary = u'Not enough data for filesystem <%s>: Skipping' % (name),
body = msg_body.render('html'))
log1.warn(msg)
continue # skip to next filesystem
u = uv.as_percentage()
if u > max_u:
tpl = template_loader.load('df.excessive-usage.html')
Expand All @@ -61,18 +81,33 @@ def check(self, hostname):
generated_at = datetime.datetime.now())
msg = Message(
title = u'Running out of space at %s' %(hostname),
summary = u'Check df space at `%s`: FAILED (%.1f > %.1f)' %(name, u, max_u),
summary = u'Check df space at <%s>: FAILED (%.1f > %.1f)' %(name, u, max_u),
body = msg_body.render('html'))
log1.warn(msg)
else:
msg = Message(
title = u'Checking df space at %s' % (hostname),
summary = u'Check df space at `%s`: OK (%.1f < %.1f)' % (name, u, max_u),
summary = u'Check df space at <%s>: OK (%.1f < %.1f)' % (name, u, max_u),
body = None)
log1.info(msg)

# Check inodes
uv = self.get_usage_of_inodes(data_dir, name)
# Check inodes

for name in fs_names:
try:
uv = self.get_usage_of_inodes(data_dir, name)
except NotEnoughData as ex:
tpl = template_loader.load('not-enough-data.html')
msg_body = tpl.generate(
hostname = hostname,
exc_message = str(ex),
generated_at = datetime.datetime.now())
msg = Message(
title = u'Not enough data for inodes usage at %s' % (hostname),
summary = u'Not enough data for inodes at <%s>: Skipping' % (name),
body = msg_body.render('html'))
log1.warn(msg)
continue # skip to next filesystem
u = uv.as_percentage()
if u > max_u:
tpl = template_loader.load('df.excessive-usage-of-inodes.html')
Expand All @@ -84,18 +119,19 @@ def check(self, hostname):
generated_at = datetime.datetime.now())
msg = Message(
title = u'Running out of inodes at %s' %(hostname),
summary = u'Check df inodes at `%s`: FAILED (%.1f > %.1f)' %(name, u, max_u),
summary = u'Check df inodes at <%s>: FAILED (%.1f > %.1f)' %(name, u, max_u),
message = msg_body.render('html'))
log1.warn(msg)
else:
msg = Message(
title = u'Checking df inodes at %s' % (hostname),
summary = u'Check df inodes at `%s`: OK (%.1f < %.1f)' % (name, u, max_u),
summary = u'Check df inodes at <%s>: OK (%.1f < %.1f)' % (name, u, max_u),
body = None)
log1.info(msg)

return


## Helpers ##

def get_usage_of_space(self, data_dir, fs_name):

rrd_file = os.path.join(data_dir, 'df-%s/df_complex-free.rrd' % (fs_name))
Expand Down Expand Up @@ -135,7 +171,7 @@ def get_usage_of_inodes(self, data_dir, fs_name):
reserved = reserved_inodes)

@staticmethod
def _find_fs_names(data_dir):
def find_fs_names(data_dir):
res = []
for f in os.listdir(data_dir):
m = re.match('^df-(.*)$', f)
Expand Down
10 changes: 8 additions & 2 deletions alerts/lib/checkers/foo.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,18 @@ def __init__(self):
BaseChecker.__init__(self)
self.opts = None

## IChecker interface ##

def setup(self, collection_dir, logger, opts):

BaseChecker.setup(self, collection_dir, logger, opts)
self.opts = opts.copy()

return

def check(self, hostname):

log1 = self.get_logger(hostname)
data_dir = self.data_dir(hostname)

log1.info('Checking foo (data_dir is %s)', data_dir)

return
6 changes: 6 additions & 0 deletions alerts/lib/checkers/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,18 @@ def __init__(self):
self.resolution = None
return

## IChecker interface ##

def setup(self, collection_dir, logger, opts):

BaseChecker.setup(self, collection_dir, logger, opts)
self.max_level = int(opts.get('usage_level')) # units??
self.start = '-%ds' % (int(opts.get('interval', 1200)))
self.resolution = '%d' % (int(opts.get('resolution', 120)))
return

def check(self, hostname):

log1 = self.get_logger(hostname)
data_dir = self.data_dir(hostname)

Expand All @@ -43,6 +47,8 @@ def check(self, hostname):

return

## Helpers ##

def get_usage(self, data_dir):
rrd_file = os.path.join(data_dir, 'load/load.rrd')
stats = Stats(rrd_file)
Expand Down
Loading

0 comments on commit b8b9cc1

Please sign in to comment.