Skip to content

Commit

Permalink
feat: add service discovery metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
xoxys committed Jun 15, 2021
1 parent 9d90c3b commit cbb62d7
Show file tree
Hide file tree
Showing 7 changed files with 88 additions and 21 deletions.
5 changes: 5 additions & 0 deletions docs/content/configuration/defaults.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@ logging:
# supported log formats: console|json|simple
format: console

metrics:
enabled: true
address: "127.0.0.1"
port: 8000

output_file:
loop_delay: 300
# Run pve sd in a loop and discover hosts every n seconds (as defined in loop_delay).
Expand Down
9 changes: 9 additions & 0 deletions docs/content/configuration/env.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,26 @@ title: Environment Variables
<!-- spellchecker-disable -->
{{< highlight Shell "linenos=table" >}}
PROMETHEUS_PVE_SD_CONFIG_FILE=

# supported log levels: debug|info|warning|error|critical
PROMETHEUS_PVE_SD_LOG_LEVEL=warning
# supported log formats: console|json|simple
PROMETHEUS_PVE_SD_LOG_FORMAT=console

PROMETHEUS_PVE_SD_METRICS_ENABLED=true
PROMETHEUS_PVE_SD_METRICS_ADDRESS=127.0.0.1
PROMETHEUS_PVE_SD_METRICS_PORT=8000

PROMETHEUS_PVE_SD_OUTPUT_FILE=
PROMETHEUS_PVE_SD_LOOP_DELAY=300

# Run pve sd in a loop and discover hosts every n seconds (as defined in PROMETHEUS_PVE_SD_LOOP_DELAY).
# Can be disabled to run discovery only once.
PROMETHEUS_PVE_SD_SERVICE=true

PROMETHEUS_PVE_SD_EXCLUDE_STATE=
PROMETHEUS_PVE_SD_EXCLUDE_VMID=

PROMETHEUS_PVE_SD_PVE_SERVER=
PROMETHEUS_PVE_SD_PVE_USER=
PROMETHEUS_PVE_SD_PVE_PASSWORD=
Expand Down
32 changes: 16 additions & 16 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

24 changes: 19 additions & 5 deletions prometheuspvesd/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
import tempfile
from time import sleep

from prometheus_client import start_http_server

import prometheuspvesd.exception
from prometheuspvesd import __version__
from prometheuspvesd.config import SingleConfig
Expand Down Expand Up @@ -119,12 +121,20 @@ def _get_config(self):
return config

def _fetch(self):
loop_delay = self.config.config["loop_delay"]
output_file = self.config.config["output_file"]

self.logger.info("Writes targets to {}".format(output_file))
self.logger.info("Writes targets to {}".format(self.config.config["output_file"]))
self.logger.debug("Propagate from PVE")

if self.config.config["service"] and self.config.config["metrics"]["enabled"]:
self.logger.info(
"Starting metrics http endpoint on port {}".format(
self.config.config["metrics"]["port"]
)
)
start_http_server(
self.config.config["metrics"]["port"],
addr=self.config.config["metrics"]["address"]
)

while True:
try:
inventory = self.discovery.propagate()
Expand All @@ -138,7 +148,11 @@ def _fetch(self):
if not self.config.config["service"]:
break

self.logger.info("Waiting {} seconds for next discovery loop".format(loop_delay))
self.logger.info(
"Waiting {} seconds for next discovery loop".format(
self.config.config["loop_delay"]
)
)
sleep(self.config.config["loop_delay"])

def _write(self, host_list: HostList):
Expand Down
15 changes: 15 additions & 0 deletions prometheuspvesd/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,21 @@ class Config():
"""

SETTINGS = {
"metrics.enabled": {
"default": True,
"env": "METRICS_ENABLED",
"type": environs.Env().bool
},
"metrics.address": {
"default": "127.0.0.1",
"env": "METRICS_ADDRESS",
"type": environs.Env().str
},
"metrics.port": {
"default": 8000,
"env": "METRICS_PORT",
"type": environs.Env().int
},
"config_file": {
"default": "",
"env": "CONFIG_FILE",
Expand Down
23 changes: 23 additions & 0 deletions prometheuspvesd/discovery.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
from collections import defaultdict

import requests
from prometheus_client import Counter
from prometheus_client import Gauge
from prometheus_client import Summary

from prometheuspvesd.config import SingleConfig
from prometheuspvesd.exception import APIError
Expand All @@ -21,6 +24,15 @@
except ImportError:
HAS_PROXMOXER = False

PROPAGATION_TIME = Summary(
"pve_sd_propagate_seconds", "Time spent propagating the inventory from PVE"
)
HOST_GAUGE = Gauge("pve_sd_hosts", "Number of hosts discovered by PVE SD")
PVE_REQUEST_COUNT_TOTAL = Counter("pve_sd_requests_total", "Total count of requests to PVE API")
PVE_REQUEST_COUNT_ERROR_TOTAL = Counter(
"pve_sd_requests_error_total", "Total count of failed requests to PVE API"
)


class Discovery():
"""Prometheus PVE Service Discovery."""
Expand Down Expand Up @@ -48,6 +60,7 @@ def _auth(self):
timeout=self.config.config["pve"]["auth_timeout"]
)
except requests.RequestException as e:
PVE_REQUEST_COUNT_ERROR_TOTAL.inc()
raise APIError(str(e))

def _get_names(self, pve_list, pve_type):
Expand Down Expand Up @@ -89,6 +102,7 @@ def validate(address):
if pve_type == "qemu":
# If qemu agent is enabled, try to gather the IP address
try:
PVE_REQUEST_COUNT_TOTAL.inc()
if self.client.nodes(pve_node).get(pve_type, vmid, "agent", "info") is not None:
networks = self.client.nodes(pve_node).get(
"qemu", vmid, "agent", "network-get-interfaces"
Expand All @@ -104,6 +118,7 @@ def validate(address):

if not address:
try:
PVE_REQUEST_COUNT_TOTAL.inc()
config = self.client.nodes(pve_node).get(pve_type, vmid, "config")
sources = [config["net0"], config["ipconfig0"]]

Expand Down Expand Up @@ -133,20 +148,26 @@ def _exclude(self, pve_list):
filtered.append(item.copy())
return filtered

@PROPAGATION_TIME.time()
def propagate(self):
self.host_list.clear()

PVE_REQUEST_COUNT_TOTAL.inc()
for node in self._get_names(self.client.nodes.get(), "node"):
try:
PVE_REQUEST_COUNT_TOTAL.inc()
qemu_list = self._exclude(self.client.nodes(node).qemu.get())
PVE_REQUEST_COUNT_TOTAL.inc()
container_list = self._exclude(self.client.nodes(node).lxc.get())
except Exception as e: # noqa
PVE_REQUEST_COUNT_ERROR_TOTAL.inc()
raise APIError(str(e))

# Merge QEMU and Containers lists from this node
instances = self._get_variables(qemu_list, "qemu").copy()
instances.update(self._get_variables(container_list, "container"))

HOST_GAUGE.set(len(instances))
self.logger.info("Found {} targets".format(len(instances)))
for host in instances:
host_meta = instances[host]
Expand All @@ -157,13 +178,15 @@ def propagate(self):
except KeyError:
pve_type = "qemu"

PVE_REQUEST_COUNT_TOTAL.inc()
config = self.client.nodes(node).get(pve_type, vmid, "config")

try:
description = (config["description"])
except KeyError:
description = None
except Exception as e: # noqa
PVE_REQUEST_COUNT_ERROR_TOTAL.inc()
raise APIError(str(e))

try:
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ python = "^3.6.0"
python-json-logger = "2.0.1"
requests = "2.25.1"
"ruamel.yaml" = "0.17.9"
prometheus-client = "^0.11.0"

[tool.poetry.dev-dependencies]
bandit = "1.7.0"
Expand Down

0 comments on commit cbb62d7

Please sign in to comment.