Add prometheus exporter for nova-bigvm
Exposing the following metrics:

Counter nova_bigvm_host_errors{error, vc, host, rp}
Counter nova_bigvm_no_candidate_error{hv_size}
Gauge nova_bigvm_host_freeing_up{vc, host, rp}
Gauge nova_bigvm_free_hosts_count{hv_size}

Change-Id: I050eeb1036910c03428eaa8aad7e992f241f6f51
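For illustration, once Prometheus scrapes the new endpoint these metrics surface as series along the following lines (only the metric and label names come from the exporter; the label values and numbers are made up):

    nova_bigvm_host_errors{error="freeing",vc="vc-a-0",host="node042",rp="prep_bigvm_node042"} 1.0
    nova_bigvm_no_candidate_error{hv_size="1464"} 2.0
    nova_bigvm_host_freeing_up{vc="vc-a-0",host="node042",rp="prep_bigvm_node042"} 1.0
    nova_bigvm_free_hosts_count{hv_size="1464"} 3.0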
leust committed Dec 13, 2023
1 parent e675ba5 commit be76d9d
Showing 5 changed files with 129 additions and 6 deletions.
96 changes: 96 additions & 0 deletions nova/bigvm/exporter.py
@@ -0,0 +1,96 @@
# Copyright 2022 SAP SE
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from oslo_log import log as logging
from prometheus_client import CollectorRegistry
from prometheus_client import Counter
from prometheus_client import Gauge
from prometheus_client import start_http_server

import nova.conf

CONF = nova.conf.CONF

LOG = logging.getLogger(__name__)

REGISTRY = CollectorRegistry(auto_describe=True)

ERROR_FREEING = 'freeing'


class _BigVmPrometheusMetrics:

def __init__(self, registry):
self.host_errors_counter = \
Counter('nova_bigvm_host_errors',
'Counts errors that happened while reconciling '
'a host. The "error" is a short code meaning: '
'freeing = Error while freeing up a host',
labelnames=['error', 'vc', 'host', 'rp'],
registry=registry)

self.no_candidate_error_counter = \
Counter('nova_bigvm_no_candidate_error',
'Counter that increments each time the '
'reconciliation loop cannot find a '
'resource-provider for freeing-up a host.',
labelnames=['hv_size'],
registry=registry)

self.host_freeing_up_gauge = \
Gauge('nova_bigvm_host_freeing_up',
'Gauge for each BigVM host that is currently '
'being freed up.',
labelnames=['vc', 'host', 'rp'],
registry=registry)

self.free_hosts_count_gauge = \
Gauge('nova_bigvm_free_hosts_count',
'The total amount of available BigVM hosts '
'in the region.',
labelnames=['hv_size'],
registry=registry)

def bigvm_host_error(self, error, rp):
self.host_errors_counter.labels(
error, rp['vc'], rp['host'], rp['rp']['name']).inc()

def error_freeing(self, rp):
self.bigvm_host_error(ERROR_FREEING, rp)

def no_candidate_error(self, hv_size):
self.no_candidate_error_counter.labels(hv_size).inc()

def set_freeing_provider(self, rp):
self.host_freeing_up_gauge.labels(
rp['vc'], rp['host'], rp['rp']['name']).set(1)

def remove_freeing_provider(self, rp):
try:
self.host_freeing_up_gauge.remove(
rp['vc'], rp['host'], rp['rp']['name'])
except KeyError:
pass

def set_free_hosts_count(self, hv_size, count):
self.free_hosts_count_gauge.labels(hv_size).set(count)


bigvm_metrics = _BigVmPrometheusMetrics(REGISTRY)


def start_bigvm_exporter():
port = CONF.bigvm_exporter_listen_port
start_http_server(port, registry=REGISTRY)
LOG.info("Started BigVM prometheus exporter on port %s", port)
29 changes: 23 additions & 6 deletions nova/bigvm/manager.py
@@ -15,6 +15,7 @@
"""
BigVM service
"""
import collections
import itertools

import os_resource_classes as orc
@@ -23,6 +24,7 @@
from oslo_messaging import exceptions as oslo_exceptions
from oslo_service import periodic_task

from nova.bigvm.exporter import bigvm_metrics
import nova.conf
from nova import context as nova_context
from nova import exception
@@ -167,6 +169,7 @@ def _flatten(list_of_lists):
'max_used': CONF.bigvm_cluster_max_usage_percent,
'max_reserved':
CONF.bigvm_cluster_max_reservation_percent})
bigvm_metrics.no_candidate_error(hv_size)
continue

# filter out providers that are disabled in general or for bigVMs
@@ -184,6 +187,7 @@
'host for hypervisor size %(hv_size)d, because '
'all providers with enough space are disabled.',
{'hv_size': hv_size})
bigvm_metrics.no_candidate_error(hv_size)
continue

candidates[hv_size] = (alloc_reqs, filtered_provider_summaries)
@@ -195,6 +199,7 @@
'up a host for hypervisor size %(hv_size)d in '
'%(vc)s.',
{'hv_size': hv_size, 'vc': vc})
bigvm_metrics.no_candidate_error(hv_size)
continue
alloc_reqs, provider_summaries = candidates[hv_size]

@@ -217,7 +222,7 @@ def _free_memory(p):
cm = vmware_providers[rp_uuid]['cell_mapping']
with nova_context.target_cell(context, cm) as cctxt:
if self._free_host_for_provider(cctxt, rp_uuid,
host):
host, vc):
break
except oslo_exceptions.MessagingTimeout as e:
# we don't know if the timeout happened after we started
@@ -630,6 +635,7 @@ def _get_missing_hv_sizes(self, context, vcenters,
"""
found_hv_sizes_per_vc = {vc: set() for vc in vcenters}

free_hosts = collections.defaultdict(int)
for rp_uuid, rp in bigvm_providers.items():
host_rp_uuid = rp['host_rp_uuid']
hv_size = vmware_providers[host_rp_uuid]['hv_size']
@@ -645,16 +651,26 @@

if state == special_spawning.FREE_HOST_STATE_DONE:
self._add_resources_to_provider(context, rp_uuid, rp)
bigvm_metrics.remove_freeing_provider(rp)
free_hosts[hv_size] += 1
elif state == special_spawning.FREE_HOST_STATE_ERROR:
LOG.warning('Freeing a host for spawning failed on '
'%(host)s.',
{'host': rp['host']})
# do some cleanup, so another compute-node is used
found_hv_sizes_per_vc[rp['vc']].remove(hv_size)
bigvm_metrics.remove_freeing_provider(rp)
bigvm_metrics.error_freeing(rp)
self._clean_up_consumed_provider(context, rp_uuid, rp)
else:
LOG.info('Waiting for host on %(host)s to free up.',
{'host': rp['host']})
bigvm_metrics.set_freeing_provider(rp)
else:
free_hosts[hv_size] += 1

for hv_size, count in free_hosts.items():
bigvm_metrics.set_free_hosts_count(hv_size, count)

hv_sizes_per_vc = {
vc: set(rp['hv_size'] for rp in vmware_providers.values()
@@ -692,7 +708,7 @@ def _add_resources_to_provider(self, context, rp_uuid, rp):
'on %(host)s.',
{'host': rp['host']})

def _free_host_for_provider(self, context, rp_uuid, host):
def _free_host_for_provider(self, context, rp_uuid, host, vc):
"""Takes care of creating a child resource provider in placement to
"claim" a resource-provider/host for freeing up a host. Then calls the
driver to actually free up the host in the cluster.
@@ -764,17 +780,18 @@ def _free_host_for_provider(self, context, rp_uuid, host):

# find a host and let DRS free it up
state = self.special_spawn_rpc.free_host(context, host)

new_rp = {'host': host,
'vc': vc,
'rp': {'name': new_rp_name},
'host_rp_uuid': rp_uuid}
if state == special_spawning.FREE_HOST_STATE_DONE:
# there were free resources available immediately
needs_cleanup = False
new_rp = {'host': host,
'rp': {'name': new_rp_name},
'host_rp_uuid': rp_uuid}
self._add_resources_to_provider(context, new_rp_uuid, new_rp)
elif state == special_spawning.FREE_HOST_STATE_STARTED:
# it started working on it. we have to check back later
# if it's done
bigvm_metrics.set_freeing_provider(new_rp)
needs_cleanup = False
finally:
# clean up placement, if something went wrong
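Condensed, the manager changes implement a small per-host state machine around the freeing flow. A self-contained sketch of one reconciliation pass (the state constants, the 'freeing' flag, and the dict shapes only mirror the diff; this is illustrative, not the actual manager code):

    import collections

    # Placeholder values; the real constants live in special_spawning.
    FREE_HOST_STATE_DONE = 'done'
    FREE_HOST_STATE_ERROR = 'error'

    def export_free_host_metrics(bigvm_providers, vmware_providers,
                                 metrics, get_state):
        """Illustrative single pass over the bigVM providers."""
        free_hosts = collections.defaultdict(int)
        for rp_uuid, rp in bigvm_providers.items():
            hv_size = vmware_providers[rp['host_rp_uuid']]['hv_size']
            if not rp.get('freeing'):            # host was already freed earlier
                free_hosts[hv_size] += 1
                continue
            state = get_state(rp)
            if state == FREE_HOST_STATE_DONE:    # freeing finished
                metrics.remove_freeing_provider(rp)
                free_hosts[hv_size] += 1
            elif state == FREE_HOST_STATE_ERROR:
                metrics.remove_freeing_provider(rp)
                metrics.error_freeing(rp)        # host_errors{error="freeing"} += 1
            else:                                # still in progress
                metrics.set_freeing_provider(rp)
        for hv_size, count in free_hosts.items():
            metrics.set_free_hosts_count(hv_size, count)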
3 changes: 3 additions & 0 deletions nova/cmd/bigvm.py
@@ -21,6 +21,7 @@
from oslo_reports import guru_meditation_report as gmr
from oslo_reports import opts as gmr_opts

from nova.bigvm.exporter import start_bigvm_exporter
import nova.conf
from nova import config
from nova import objects
@@ -39,6 +40,8 @@ def main():

gmr.TextGuruMeditation.setup_autorun(version, conf=CONF)

start_bigvm_exporter()

server = service.Service.create(binary='nova-bigvm')
service.serve(server)
service.wait()
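With start_bigvm_exporter() wired into main(), a quick smoke test against a running nova-bigvm could look like this (the port is only the option's default; adjust it to your deployment):

    import urllib.request

    # Expect the nova_bigvm_* metric families declared in exporter.py
    # to appear in the scraped exposition text.
    with urllib.request.urlopen('http://localhost:9847/metrics') as resp:
        print(resp.read().decode()[:500])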
6 changes: 6 additions & 0 deletions nova/conf/base.py
@@ -192,6 +192,12 @@
Compare the values of conf.vmware.memory_reservation_cluster_hosts_max_fail and
conf.vmware.memory_reservation_max_ratio_fallback to see how much of total
memory is actually reservable.
"""),
cfg.IntOpt(
'bigvm_exporter_listen_port',
default=9847,
help="""
Port on which the BigVM prometheus exporter listens for HTTP requests.
"""),
]

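Since the manager reads the option as CONF.bigvm_exporter_listen_port, overriding the default in nova.conf should be a plain DEFAULT-section entry (a sketch; the value 9947 is arbitrary):

    [DEFAULT]
    bigvm_exporter_listen_port = 9947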
1 change: 1 addition & 0 deletions requirements.txt
@@ -68,3 +68,4 @@ futurist>=1.8.0 # Apache-2.0
openstacksdk>=0.35.0 # Apache-2.0
dataclasses>=0.7;python_version=='3.6' # Apache 2.0 License
PyYAML>=5.1 # MIT
prometheus_client
