diff --git a/NEWS b/NEWS index 151e0d60..1be3d09c 100644 --- a/NEWS +++ b/NEWS @@ -1,4 +1,6 @@ 1.5.0 + * #245: Monitor backups with PagerDuty hook integration. See the documentation for more + information: https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#pagerduty-hook * #255: Add per-action hooks: "before_prune", "after_prune", "before_check", and "after_check". * #274: Add ~/.config/borgmatic.d as another configuration directory default. * #277: Customize Healthchecks log level via borgmatic "--monitoring-verbosity" flag. diff --git a/README.md b/README.md index 16dc0eaa..b2ba6e0a 100644 --- a/README.md +++ b/README.md @@ -66,6 +66,7 @@ borgmatic is powered by [Borg Backup](https://www.borgbackup.org/). Healthchecks      Cronitor      Cronhub      +PagerDuty      rsync.net      BorgBase      diff --git a/borgmatic/config/schema.yaml b/borgmatic/config/schema.yaml index 3a09fdf5..a228e7a3 100644 --- a/borgmatic/config/schema.yaml +++ b/borgmatic/config/schema.yaml @@ -567,6 +567,15 @@ map: for details. example: https://cronitor.link/d3x0c1 + pagerduty: + type: str + desc: | + PagerDuty integration key used to notify PagerDuty when a backup errors. Create + an account at https://www.pagerduty.com/ if you'd like to use this service. See + https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#pagerduty-hook + for details. 
# Upper bound, in seconds, on how long to wait for PagerDuty. Without a timeout, requests waits
# indefinitely, which would stall borgmatic's failure handling if the API is unreachable.
_REQUEST_TIMEOUT_SECONDS = 10


def ping_monitor(integration_key, config_filename, state, monitoring_log_level, dry_run):
    '''
    If this is an error state, create a PagerDuty event with the given integration key. Use the
    given configuration filename in any log entries. If this is a dry run, then don't actually
    create an event.

    Any state other than monitor.State.FAIL is logged at debug level and otherwise ignored. The
    given monitoring log level is not used by this hook.

    A network-level failure while contacting PagerDuty is logged as a warning instead of being
    raised, so that a broken alerting channel doesn't mask the original backup error.
    '''
    if state != monitor.State.FAIL:
        logger.debug(
            '{}: Ignoring unsupported monitoring {} in PagerDuty hook'.format(
                config_filename, state.name.lower()
            )
        )
        return

    # The label carries its own leading space, so the format string must not add another one.
    dry_run_label = ' (dry run; not actually sending)' if dry_run else ''
    logger.info('{}: Sending failure event to PagerDuty{}'.format(config_filename, dry_run_label))

    if dry_run:
        return

    hostname = platform.node()
    # Timezone-aware "now", converted to the local zone and rendered with its UTC offset.
    local_timestamp = datetime.datetime.now(datetime.timezone.utc).astimezone().isoformat()
    payload = json.dumps(
        {
            'routing_key': integration_key,
            'event_action': 'trigger',
            'payload': {
                'summary': 'backup failed on {}'.format(hostname),
                'severity': 'error',
                'source': hostname,
                'timestamp': local_timestamp,
                'component': 'borgmatic',
                'group': 'backups',
                'class': 'backup failure',
                'custom_details': {
                    'hostname': hostname,
                    'configuration filename': config_filename,
                    'server time': local_timestamp,
                },
            },
        }
    )
    # NOTE(review): the payload includes the integration key, so it ends up in debug logs.
    logger.debug('{}: Using PagerDuty payload: {}'.format(config_filename, payload))

    # Quiet urllib3's own logging while sending the event.
    logging.getLogger('urllib3').setLevel(logging.ERROR)
    try:
        requests.post(
            EVENTS_API_URL, data=payload.encode('utf-8'), timeout=_REQUEST_TIMEOUT_SECONDS
        )
    except requests.exceptions.RequestException as error:
        logger.warning('{}: PagerDuty error: {}'.format(config_filename, error))
See -[Healthchecks +[Cronitor](https://cronitor.io), [Cronhub](https://cronhub.io), and +[PagerDuty](https://www.pagerduty.com/), and pings these services whenever +borgmatic runs. That way, you'll receive an alert when something goes wrong or +(for certain hooks) the service doesn't hear from borgmatic for a configured +interval. See [Healthchecks hook](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#healthchecks-hook), [Cronitor -hook](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#cronitor-hook), and [Cronhub -hook](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#cronhub-hook) +hook](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#cronitor-hook), [Cronhub +hook](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#cronhub-hook), and +[PagerDuty hook](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#pagerduty-hook) below for how to configure this. 3. **Third-party monitoring software**: You can use traditional monitoring software to consume borgmatic JSON output and track when the last @@ -200,6 +201,32 @@ mechanisms](https://docs.cronhub.io/integrations.html) when backups fail or it doesn't hear from borgmatic for a certain period of time. +## PagerDuty hook + +[PagerDuty](https://www.pagerduty.com/) provides incident monitoring and alerting, +and borgmatic has built-in integration with it. Once you create a PagerDuty +account and service +on their site, all you need to do is configure borgmatic with the unique +"Integration Key" for your service. Here's an example: + + +```yaml +hooks: + pagerduty: a177cad45bd374409f78906a810a3074 +``` + +With this hook in place, borgmatic creates a PagerDuty event for your service +whenever backups fail. Specifically, if an error occurs during a `create`, +`prune`, or `check` action, borgmatic sends an event to PagerDuty after the +`on_error` hooks run. Note that borgmatic does not contact PagerDuty when a +backup starts or ends without error. 
def test_ping_monitor_ignores_start_state():
    # A start state must never reach the PagerDuty API.
    flexmock(module.requests).should_receive('post').never()

    module.ping_monitor(
        'abc123', 'config.yaml', module.monitor.State.START, monitoring_log_level=1, dry_run=False
    )


def test_ping_monitor_ignores_finish_state():
    # A finish state must never reach the PagerDuty API.
    flexmock(module.requests).should_receive('post').never()

    module.ping_monitor(
        'abc123', 'config.yaml', module.monitor.State.FINISH, monitoring_log_level=1, dry_run=False
    )


def test_ping_monitor_calls_api_for_fail_state():
    # Without a call-count constraint, the expectation is satisfied even if post() is never
    # invoked. Require exactly one call so the test fails when no event is sent.
    flexmock(module.requests).should_receive('post').once()

    module.ping_monitor(
        'abc123', 'config.yaml', module.monitor.State.FAIL, monitoring_log_level=1, dry_run=False
    )


def test_ping_monitor_dry_run_does_not_call_api():
    # Even for a fail state, a dry run must not send anything.
    flexmock(module.requests).should_receive('post').never()

    module.ping_monitor(
        'abc123', 'config.yaml', module.monitor.State.FAIL, monitoring_log_level=1, dry_run=True
    )