diff --git a/alerts/config-example.ini b/alerts/config-example.ini index 98ade2f..d32a177 100644 --- a/alerts/config-example.ini +++ b/alerts/config-example.ini @@ -8,20 +8,20 @@ foo = alerts.lib.checkers.foo:Checker [alerts] -check = cpu memory df +check = cpu memory df curl # specify in jiffies cpu.usage_level = 85 -cpu.interval = 1800 +cpu.interval = 1200 cpu.resolution = 120 # specify as percentage memory.usage_level = 95 -memory.interval = 300 +memory.interval = 360 memory.resolution = 60 # specify as percentage -df.usage_level = 95 +df.usage_level = 90 df.interval = 1200 df.resolution = 600 @@ -31,6 +31,11 @@ nginx.active_connections = 100 foo.bar = baz foo.usage_level = 0.9 +# specify as seconds +curl.max_response_time = 5 +curl.interval = 120 +curl.resolution = 10 + [mailer] smtp_host = mail.localdomain diff --git a/alerts/lib/checkers/__init__.py b/alerts/lib/checkers/__init__.py index 5d1632f..a7e0d2c 100644 --- a/alerts/lib/checkers/__init__.py +++ b/alerts/lib/checkers/__init__.py @@ -73,3 +73,4 @@ def check(self, hostname): from . import cpu from . import memory from . import df +from . 
import curl diff --git a/alerts/lib/checkers/cpu.py b/alerts/lib/checkers/cpu.py index 39b67a3..b714822 100644 --- a/alerts/lib/checkers/cpu.py +++ b/alerts/lib/checkers/cpu.py @@ -7,6 +7,7 @@ from alerts import template_loader from alerts.lib import Message from alerts.lib.collected_stats import Stats as BaseStats +from alerts.lib.collected_stats import NoData, NotEnoughData from alerts.lib.checkers import BaseChecker, named_checker class Stats(BaseStats): @@ -24,7 +25,10 @@ def __init__(self): self.resolution = None return + ## IChecker interface ## + def setup(self, collection_dir, logger, opts): + BaseChecker.setup(self, collection_dir, logger, opts) self.max_level = int(opts.get('usage_level', 85)) # jiffies self.start = '-%ds' % (int(opts.get('interval', 1800))) @@ -32,15 +36,30 @@ def setup(self, collection_dir, logger, opts): return def check(self, hostname): + log1 = self.get_logger(hostname) data_dir = self.data_dir(hostname) max_u = self.max_level - n = self._find_number_of_cpus(data_dir) + n = self.find_number_of_cpus(data_dir) for i in range(0, n): - u = self.get_usage(data_dir, i, 'user') - log1.debug('Computed usage for CPU #%d: %.2f', i, u) + try: + u = self.get_usage(data_dir, i, 'user') + except NotEnoughData as ex: + tpl = template_loader.load('not-enough-data.html') + msg_body = tpl.generate( + hostname = hostname, + exc_message = str(ex), + generated_at = datetime.datetime.now()) + msg = Message( + title = u'Not enough data for processor usage at %s' % (hostname), + summary = u'Not enough data for CPU #%d: Skipping' % (i), + body = msg_body.render('html')) + log1.warn(msg) + continue # skip to next processor + else: + log1.debug('Computed usage for CPU #%d: %.2f', i, u) if u > max_u: tpl = template_loader.load('cpu.excessive-usage.html') msg_body = tpl.generate( @@ -61,14 +80,16 @@ def check(self, hostname): body = None) log1.info(msg) return - + + ## Helpers ## + def get_usage(self, data_dir, cpu_number, state='user'): rrd_file = 
os.path.join(data_dir, 'cpu-%d/cpu-%s.rrd' % (cpu_number, state)) stats = Stats(rrd_file) return stats.avg('value', self.start, self.resolution) @staticmethod - def _find_number_of_cpus(data_dir): + def find_number_of_cpus(data_dir): max_i = -1 for f in os.listdir(data_dir): m = re.match('^cpu-([0-9][0-9]?)$', f) diff --git a/alerts/lib/checkers/curl.py b/alerts/lib/checkers/curl.py new file mode 100644 index 0000000..bee3d65 --- /dev/null +++ b/alerts/lib/checkers/curl.py @@ -0,0 +1,106 @@ +import os +import re +import datetime +import zope.interface +from thrush import rrd + +from alerts import template_loader +from alerts.lib import Message +from alerts.lib.collected_stats import Stats as BaseStats +from alerts.lib.collected_stats import NoData, NotEnoughData +from alerts.lib.checkers import BaseChecker, named_checker + +class Stats(BaseStats): + + UNKNOWN_CDP_RATIO = 0.4 + + class RRD(rrd.RRD): + value = rrd.Gauge(heartbeat=20) + +@named_checker('curl') +class Checker(BaseChecker): + + def __init__(self): + BaseChecker.__init__(self) + self.max_response_time = None + self.resolution = None + self.start = None + return + + ## IChecker interface ## + + def setup(self, collection_dir, logger, opts): + + BaseChecker.setup(self, collection_dir, logger, opts) + self.max_response_time = 1e3 * float(opts.get('max_response_time', 8)) # milliseconds + self.start = '-%ds' % (int(opts.get('interval', 300))) + self.resolution = '%d' % (int(opts.get('resolution', 40))) + return + + def check(self, hostname): + + log1 = self.get_logger(hostname) + data_dir = self.data_dir(hostname) + + max_u = self.max_response_time + + for page_name in self.find_page_names(data_dir): + try: + u = 1e3 * self.get_response_time(data_dir, page_name) + except NotEnoughData as ex: + tpl = template_loader.load('curl.non-responsive-page.html') + msg_body = tpl.generate( + hostname = hostname, + page_name = page_name, + exc_message = str(ex), + generated_at = datetime.datetime.now()) + msg = Message( 
+ title = u'The page "%s" is not responsive' % (page_name), + summary = u'The page "%s" is not responsive: Skipping' % (page_name), + body = msg_body.render('html')) + log1.warn(msg) + continue # skip to next page + else: + log1.debug('Computed response time for page "%s": %.1fms', page_name, u) + + if u > max_u: + tpl = template_loader.load('curl.sluggish-page.html') + msg_body = tpl.generate( + hostname = hostname, + page_name = page_name, + max_response_time = '%.1fms' % (max_u), + avg_response_time = '%.1fms' % (u), + generated_at = datetime.datetime.now()) + msg = Message( + title = u'The page "%s" takes too long to respond' % (page_name), + summary = u'Check if page "%s" responsive: FAILED (%.1fms > %.1fms)' % ( + page_name, u, max_u), + body = msg_body.render('html')) + log1.warn(msg) + else: + msg = Message( + title = u'Check if page "%s" responsive' %(page_name), + summary = u'Check if page "%s" responsive: OK (%.1fms < %.1fms)' % ( + page_name, u, max_u), + body = None) + log1.info(msg) + pass + + return + + ## Helpers ## + + def get_response_time(self, data_dir, page_name): + rrd_file = os.path.join(data_dir, 'curl-%s/response_time.rrd' % (page_name)) + stats = Stats(rrd_file) + return stats.avg('value', self.start, self.resolution) + + @staticmethod + def find_page_names(data_dir): + res = [] + for f in os.listdir(data_dir): + m = re.match('^curl-(.*)$', f) + if m: + res.append(m.group(1)) + return res + diff --git a/alerts/lib/checkers/df.py b/alerts/lib/checkers/df.py index c19745a..2f5f63e 100644 --- a/alerts/lib/checkers/df.py +++ b/alerts/lib/checkers/df.py @@ -8,6 +8,7 @@ from alerts import template_loader from alerts.lib import Message from alerts.lib.collected_stats import Stats as BaseStats +from alerts.lib.collected_stats import NoData, NotEnoughData from alerts.lib.checkers import BaseChecker, named_checker class Stats(BaseStats): @@ -33,7 +34,10 @@ def __init__(self): self.resolution = None return + ## IChecker interface ## + def setup(self, 
collection_dir, logger, opts): + BaseChecker.setup(self, collection_dir, logger, opts) self.max_level = int(opts.get('usage_level', 90)) # self.start = '-%ds' % (int(opts.get('interval', 1200))) @@ -41,15 +45,31 @@ def setup(self, collection_dir, logger, opts): return def check(self, hostname): + log1 = self.get_logger(hostname) data_dir = self.data_dir(hostname) max_u = self.max_level - fs_names = self._find_fs_names(data_dir) + fs_names = self.find_fs_names(data_dir) + + # Check space + for name in fs_names: - # Check space - uv = self.get_usage_of_space(data_dir, name) + try: + uv = self.get_usage_of_space(data_dir, name) + except NotEnoughData as ex: + tpl = template_loader.load('not-enough-data.html') + msg_body = tpl.generate( + hostname = hostname, + exc_message = str(ex), + generated_at = datetime.datetime.now()) + msg = Message( + title = u'Not enough data for filesystem usage at %s' % (hostname), + summary = u'Not enough data for filesystem <%s>: Skipping' % (name), + body = msg_body.render('html')) + log1.warn(msg) + continue # skip to next filesystem u = uv.as_percentage() if u > max_u: tpl = template_loader.load('df.excessive-usage.html') @@ -61,18 +81,33 @@ def check(self, hostname): generated_at = datetime.datetime.now()) msg = Message( title = u'Running out of space at %s' %(hostname), - summary = u'Check df space at `%s`: FAILED (%.1f > %.1f)' %(name, u, max_u), + summary = u'Check df space at <%s>: FAILED (%.1f > %.1f)' %(name, u, max_u), body = msg_body.render('html')) log1.warn(msg) else: msg = Message( title = u'Checking df space at %s' % (hostname), - summary = u'Check df space at `%s`: OK (%.1f < %.1f)' % (name, u, max_u), + summary = u'Check df space at <%s>: OK (%.1f < %.1f)' % (name, u, max_u), body = None) log1.info(msg) - # Check inodes - uv = self.get_usage_of_inodes(data_dir, name) + # Check inodes + + for name in fs_names: + try: + uv = self.get_usage_of_inodes(data_dir, name) + except NotEnoughData as ex: + tpl = 
template_loader.load('not-enough-data.html') + msg_body = tpl.generate( + hostname = hostname, + exc_message = str(ex), + generated_at = datetime.datetime.now()) + msg = Message( + title = u'Not enough data for inodes usage at %s' % (hostname), + summary = u'Not enough data for inodes at <%s>: Skipping' % (name), + body = msg_body.render('html')) + log1.warn(msg) + continue # skip to next filesystem u = uv.as_percentage() if u > max_u: tpl = template_loader.load('df.excessive-usage-of-inodes.html') @@ -84,18 +119,19 @@ def check(self, hostname): generated_at = datetime.datetime.now()) msg = Message( title = u'Running out of inodes at %s' %(hostname), - summary = u'Check df inodes at `%s`: FAILED (%.1f > %.1f)' %(name, u, max_u), + summary = u'Check df inodes at <%s>: FAILED (%.1f > %.1f)' %(name, u, max_u), message = msg_body.render('html')) log1.warn(msg) else: msg = Message( title = u'Checking df inodes at %s' % (hostname), - summary = u'Check df inodes at `%s`: OK (%.1f < %.1f)' % (name, u, max_u), + summary = u'Check df inodes at <%s>: OK (%.1f < %.1f)' % (name, u, max_u), body = None) log1.info(msg) - return - + + ## Helpers ## + def get_usage_of_space(self, data_dir, fs_name): rrd_file = os.path.join(data_dir, 'df-%s/df_complex-free.rrd' % (fs_name)) @@ -135,7 +171,7 @@ def get_usage_of_inodes(self, data_dir, fs_name): reserved = reserved_inodes) @staticmethod - def _find_fs_names(data_dir): + def find_fs_names(data_dir): res = [] for f in os.listdir(data_dir): m = re.match('^df-(.*)$', f) diff --git a/alerts/lib/checkers/foo.py b/alerts/lib/checkers/foo.py index 789898d..88afc36 100644 --- a/alerts/lib/checkers/foo.py +++ b/alerts/lib/checkers/foo.py @@ -9,12 +9,18 @@ def __init__(self): BaseChecker.__init__(self) self.opts = None + ## IChecker interface ## + def setup(self, collection_dir, logger, opts): + BaseChecker.setup(self, collection_dir, logger, opts) self.opts = opts.copy() - + return + def check(self, hostname): + log1 = self.get_logger(hostname) 
data_dir = self.data_dir(hostname) + log1.info('Checking foo (data_dir is %s)', data_dir) - + return diff --git a/alerts/lib/checkers/load.py b/alerts/lib/checkers/load.py index e8f4067..d9f344f 100644 --- a/alerts/lib/checkers/load.py +++ b/alerts/lib/checkers/load.py @@ -26,7 +26,10 @@ def __init__(self): self.resolution = None return + ## IChecker interface ## + def setup(self, collection_dir, logger, opts): + BaseChecker.setup(self, collection_dir, logger, opts) self.max_level = int(opts.get('usage_level')) # units?? self.start = '-%ds' % (int(opts.get('interval', 1200))) @@ -34,6 +37,7 @@ def setup(self, collection_dir, logger, opts): return def check(self, hostname): + log1 = self.get_logger(hostname) data_dir = self.data_dir(hostname) @@ -43,6 +47,8 @@ def check(self, hostname): return + ## Helpers ## + def get_usage(self, data_dir): rrd_file = os.path.join(data_dir, 'load/load.rrd') stats = Stats(rrd_file) diff --git a/alerts/lib/checkers/memory.py b/alerts/lib/checkers/memory.py index 34f3178..47f365a 100644 --- a/alerts/lib/checkers/memory.py +++ b/alerts/lib/checkers/memory.py @@ -8,6 +8,7 @@ from alerts import template_loader from alerts.lib import Message from alerts.lib.collected_stats import Stats as BaseStats +from alerts.lib.collected_stats import NoData, NotEnoughData from alerts.lib.checkers import BaseChecker, named_checker class Stats(BaseStats): @@ -33,7 +34,10 @@ def __init__(self): self.resolution = None return + ## IChecker interface ## + def setup(self, collection_dir, logger, opts): + BaseChecker.setup(self, collection_dir, logger, opts) self.max_level = int(opts.get('usage_level', 90)) # percentage self.start = '-%ds' % (int(opts.get('interval', 600))) @@ -41,12 +45,26 @@ def setup(self, collection_dir, logger, opts): return def check(self, hostname): + log1 = self.get_logger(hostname) data_dir = self.data_dir(hostname) - max_u = self.max_level + try: + uv = self.get_usage(data_dir) + except NotEnoughData as ex: + tpl = 
template_loader.load('not-enough-data.html') + msg_body = tpl.generate( + hostname = hostname, + exc_message = str(ex), + generated_at = datetime.datetime.now()) + msg = Message( + title = u'Not enough data for memory usage at %s' % (hostname), + summary = u'Not enough data for memory: Skipping', + body = msg_body.render('html')) + log1.warn(msg) + return # skip checks - uv = self.get_usage(data_dir) + max_u = self.max_level u = uv.as_percentage() log1.debug( 'Computed memory usage: %.1f%% (%.1fMiB used, %.1fMiB free, %.1fMiB cached)', @@ -71,6 +89,8 @@ def check(self, hostname): log1.info(msg) return + ## Helpers ## + def get_usage(self, data_dir): stats = Stats(os.path.join(data_dir, 'memory/memory-free.rrd')) diff --git a/alerts/lib/collected_stats.py b/alerts/lib/collected_stats.py index 189d7b5..a0f9823 100644 --- a/alerts/lib/collected_stats.py +++ b/alerts/lib/collected_stats.py @@ -7,8 +7,21 @@ # The actual names of the following datasources (as well as their heartbeats) # can be found by running the appropriate `rrdtool info` commands +class NoData(RuntimeError): + + pass + +class NotEnoughData(RuntimeError): + + pass + class Stats(object): + + # How many unknown CDPs can we tolerate and still consider + # their aggregate (e.g. AVERAGE) as meaningful + UNKNOWN_CDP_RATIO = 0.25 + # Represent data sources (DS) inside this RRD database class RRD(rrd.RRD): value = rrd.Gauge(heartbeat=300) @@ -20,15 +33,28 @@ def __init__(self, rrd_file): return def avg(self, ds, start, resolution): - avg = None + '''Compute average in given window and resolution. 
+ ''' + + res = None with self.db.fetch('AVERAGE', start=start, end='-0s', resolution=resolution) as res: ds_values = [] + n = 0 for t, values in res: v = values[ds] - if not (v is None) and (v == v): + n += 1 + if not (v is res.unknown) and (v == v): ds_values.append(v) - if ds_values: - avg = math.fsum(ds_values)/len(ds_values) + if n > 0: + nv = len(ds_values) + if float(nv) > (1 - self.UNKNOWN_CDP_RATIO) * (float(n - 1)): + res = math.fsum(ds_values) / nv + else: + raise NotEnoughData( + '%s: Too many unknown CDPs (>%d%%) in window [%s, -0s]' % ( + self.db.filename, int(100.0 * self.UNKNOWN_CDP_RATIO), start)) else: - avg = .0 - return avg + raise NoData( + '%s: No CDPs found in window [%s, -0s]' % ( + self.db.filename, start)) + return res diff --git a/alerts/lib/loggers.py b/alerts/lib/loggers.py index b968132..3076f30 100644 --- a/alerts/lib/loggers.py +++ b/alerts/lib/loggers.py @@ -1,7 +1,9 @@ +import os import logging import zope.interface import zope.schema from zope.schema import getValidationErrors +from socket import getfqdn from alerts import config from .interfaces import ICheckContext @@ -35,7 +37,14 @@ def __init__(self, recipients, mailer=None): def emit(self, record): msg = record.msg + try: + # A structured message ? + title, body = msg.title, msg.body + except AttributeError: + # A string-like message + title = u'%s: Checking %s' % (record.levelname.lower(), record.check_host) + body = record.getMessage() headers = { - 'Subject': unicode(msg.title), + 'Subject': unicode(title), } - self.mailer.send(self.recipients, headers, msg.body) + self.mailer.send(self.recipients, headers, body) diff --git a/alerts/templates/base.html b/alerts/templates/base.html new file mode 100644 index 0000000..eb5016d --- /dev/null +++ b/alerts/templates/base.html @@ -0,0 +1,19 @@ + + + + + + + + +
+ + + + + + diff --git a/alerts/templates/cpu.excessive-usage.html b/alerts/templates/cpu.excessive-usage.html index db39cf5..b70f7af 100644 --- a/alerts/templates/cpu.excessive-usage.html +++ b/alerts/templates/cpu.excessive-usage.html @@ -1,18 +1,16 @@ - - - - -