Skip to content

Commit

Permalink
Add prometheus exporter for nova-bigvm
Browse files Browse the repository at this point in the history
Currently exposing the following metrics:

Counter nova_bigvm_host_errors{error, vc, host, rp}
Counter nova_bigvm_no_candidate_error{hv_size}
Gauge nova_bigvm_host_freeing_up{vc, host, rp}
Gauge nova_bigvm_free_hosts_count{}
  • Loading branch information
leust committed May 2, 2023
1 parent 4bb914d commit 254e045
Show file tree
Hide file tree
Showing 5 changed files with 125 additions and 5 deletions.
95 changes: 95 additions & 0 deletions nova/bigvm/exporter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# Copyright 2022 SAP SE
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from oslo_log import log as logging
from prometheus_client import CollectorRegistry
from prometheus_client import Counter
from prometheus_client import Gauge
from prometheus_client import start_http_server

import nova.conf

# Global nova configuration object; the exporter reads its listen port
# from it.
CONF = nova.conf.CONF

LOG = logging.getLogger(__name__)

# Dedicated collector registry for the BigVM metrics, created separately
# from prometheus_client's default registry.
REGISTRY = CollectorRegistry(auto_describe=True)

# Value of the "error" label on the host error counter for failures that
# happened while freeing up a host.
ERROR_FREEING = 'freeing'


class _BigVmPrometheusMetrics:
    """Prometheus metrics recorded by the nova-bigvm service.

    Creates the following metrics on the registry handed to the constructor:

    * Counter ``nova_bigvm_host_errors{error, vc, host, rp}``
    * Counter ``nova_bigvm_no_candidate_error{hv_size}``
    * Gauge ``nova_bigvm_host_freeing_up{vc, host, rp}``
    * Gauge ``nova_bigvm_free_hosts_count``

    Methods taking ``rp`` expect a dict providing ``rp['vc']``,
    ``rp['host']`` and ``rp['rp']['name']``; these become the ``vc``,
    ``host`` and ``rp`` label values, in that order.
    """

    def __init__(self, registry):
        """Create all metrics and register them on ``registry``."""
        # Labels shared by every per-host metric.
        provider_labels = ['vc', 'host', 'rp']

        self.host_errors_counter = Counter(
            'nova_bigvm_host_errors',
            'Counts errors that happened while reconciling '
            'a host. The "error" is a short code meaning: '
            'freeing = Error while freeing up a host',
            labelnames=['error'] + provider_labels,
            registry=registry)

        self.no_candidate_error_counter = Counter(
            'nova_bigvm_no_candidate_error',
            'Counter that increments each time the '
            'reconciliation loop cannot find a '
            'resource-provider for freeing-up a host.',
            labelnames=['hv_size'],
            registry=registry)

        self.host_freeing_up_gauge = Gauge(
            'nova_bigvm_host_freeing_up',
            'Gauge for each BigVM host that is currently '
            'being freed up.',
            labelnames=list(provider_labels),
            registry=registry)

        self.free_hosts_count_gauge = Gauge(
            'nova_bigvm_free_hosts_count',
            'The total amount of available BigVM hosts '
            'in the region.',
            registry=registry)

    @staticmethod
    def _provider_labels(rp):
        """Return the ``(vc, host, rp name)`` label values taken from rp."""
        return rp['vc'], rp['host'], rp['rp']['name']

    def bigvm_host_error(self, error, rp):
        """Increment the host error counter for ``rp`` with code ``error``."""
        series = self.host_errors_counter.labels(
            error, *self._provider_labels(rp))
        series.inc()

    def error_freeing(self, rp):
        """Count an error that happened while freeing up the host of rp."""
        self.bigvm_host_error(ERROR_FREEING, rp)

    def no_candidate_error(self, hv_size):
        """Count a failure to find a candidate for hypervisor size hv_size."""
        series = self.no_candidate_error_counter.labels(hv_size)
        series.inc()

    def set_freeing_provider(self, rp):
        """Set the "freeing up" gauge of the host behind ``rp`` to 1."""
        series = self.host_freeing_up_gauge.labels(
            *self._provider_labels(rp))
        series.set(1)

    def remove_freeing_provider(self, rp):
        """Drop the "freeing up" gauge series of the host behind ``rp``.

        A ``KeyError`` raised while doing so (for instance because no such
        series is currently tracked) is silently ignored.
        """
        try:
            self.host_freeing_up_gauge.remove(*self._provider_labels(rp))
        except KeyError:
            pass

    def set_free_hosts_count(self, count):
        """Set the gauge of available BigVM hosts to ``count``."""
        self.free_hosts_count_gauge.set(count)


# Module-level metrics instance registered on REGISTRY; other modules import
# this object to record BigVM metrics.
bigvm_metrics = _BigVmPrometheusMetrics(REGISTRY)


def start_bigvm_exporter():
    """Start the HTTP server that exposes the BigVM metrics registry.

    The port is read from the ``bigvm_exporter_listen_port`` config option
    and logged once the server has been started.
    """
    listen_port = CONF.bigvm_exporter_listen_port
    start_http_server(port=listen_port, registry=REGISTRY)
    LOG.info("Started BigVM prometheus exporter on port %s", listen_port)
25 changes: 20 additions & 5 deletions nova/bigvm/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from oslo_messaging import exceptions as oslo_exceptions
from oslo_service import periodic_task

from nova.bigvm.exporter import bigvm_metrics
import nova.conf
from nova import context as nova_context
from nova import exception
Expand Down Expand Up @@ -166,6 +167,7 @@ def _flatten(list_of_lists):
'max_used': CONF.bigvm_cluster_max_usage_percent,
'max_reserved':
CONF.bigvm_cluster_max_reservation_percent})
bigvm_metrics.no_candidate_error(hv_size)
continue

# filter out providers that are disabled for bigVMs
Expand All @@ -181,6 +183,7 @@ def _flatten(list_of_lists):
'host for hypervisor size %(hv_size)d, because '
'all providers with enough space are disabled.',
{'hv_size': hv_size})
bigvm_metrics.no_candidate_error(hv_size)
continue

candidates[hv_size] = (alloc_reqs, filtered_provider_summaries)
Expand All @@ -192,6 +195,7 @@ def _flatten(list_of_lists):
'up a host for hypervisor size %(hv_size)d in '
'%(vc)s.',
{'hv_size': hv_size, 'vc': vc})
bigvm_metrics.no_candidate_error(hv_size)
continue
alloc_reqs, provider_summaries = candidates[hv_size]

Expand All @@ -214,7 +218,7 @@ def _free_memory(p):
cm = vmware_providers[rp_uuid]['cell_mapping']
with nova_context.target_cell(context, cm) as cctxt:
if self._free_host_for_provider(cctxt, rp_uuid,
host):
host, vc):
break
except oslo_exceptions.MessagingTimeout as e:
# we don't know if the timeout happened after we started
Expand Down Expand Up @@ -647,6 +651,7 @@ def _get_missing_hv_sizes(self, context, vcenters,
"""
found_hv_sizes_per_vc = {vc: set() for vc in vcenters}

free_hosts = 0
for rp_uuid, rp in bigvm_providers.items():
host_rp_uuid = rp['host_rp_uuid']
hv_size = vmware_providers[host_rp_uuid]['hv_size']
Expand All @@ -662,16 +667,25 @@ def _get_missing_hv_sizes(self, context, vcenters,

if state == special_spawning.FREE_HOST_STATE_DONE:
self._add_resources_to_provider(context, rp_uuid, rp)
bigvm_metrics.remove_freeing_provider(rp)
free_hosts += 1
elif state == special_spawning.FREE_HOST_STATE_ERROR:
LOG.warning('Freeing a host for spawning failed on '
'%(host)s.',
{'host': rp['host']})
# do some cleanup, so another compute-node is used
found_hv_sizes_per_vc[rp['vc']].remove(hv_size)
bigvm_metrics.remove_freeing_provider(rp)
bigvm_metrics.error_freeing(rp)
self._clean_up_consumed_provider(context, rp_uuid, rp)
else:
LOG.info('Waiting for host on %(host)s to free up.',
{'host': rp['host']})
bigvm_metrics.set_freeing_provider(rp)
else:
free_hosts += 1

bigvm_metrics.set_free_hosts_count(free_hosts)

hv_sizes_per_vc = {
vc: set(rp['hv_size'] for rp in vmware_providers.values()
Expand Down Expand Up @@ -708,7 +722,7 @@ def _add_resources_to_provider(self, context, rp_uuid, rp):
'on %(host)s.',
{'host': rp['host']})

def _free_host_for_provider(self, context, rp_uuid, host):
def _free_host_for_provider(self, context, rp_uuid, host, vc):
"""Takes care of creating a child resource provider in placement to
"claim" a resource-provider/host for freeing up a host. Then calls the
driver to actually free up the host in the cluster.
Expand Down Expand Up @@ -781,16 +795,17 @@ def _free_host_for_provider(self, context, rp_uuid, host):

# find a host and let DRS free it up
state = self.special_spawn_rpc.free_host(context, host)

new_rp = {'host': host,
'vc': vc,
'rp': {'name': new_rp_name}}
if state == special_spawning.FREE_HOST_STATE_DONE:
# there were free resources available immediately
needs_cleanup = False
new_rp = {'host': host,
'rp': {'name': new_rp_name}}
self._add_resources_to_provider(context, new_rp_uuid, new_rp)
elif state == special_spawning.FREE_HOST_STATE_STARTED:
# it started working on it. we have to check back later
# if it's done
bigvm_metrics.set_freeing_provider(new_rp)
needs_cleanup = False
finally:
# clean up placement, if something went wrong
Expand Down
3 changes: 3 additions & 0 deletions nova/cmd/bigvm.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from oslo_reports import guru_meditation_report as gmr
from oslo_reports import opts as gmr_opts

from nova.bigvm.exporter import start_bigvm_exporter
import nova.conf
from nova import config
from nova import objects
Expand All @@ -39,6 +40,8 @@ def main():

gmr.TextGuruMeditation.setup_autorun(version, conf=CONF)

start_bigvm_exporter()

server = service.Service.create(binary='nova-bigvm')
service.serve(server)
service.wait()
6 changes: 6 additions & 0 deletions nova/conf/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,12 @@
Compare the values of conf.vmware.memory_reservation_cluster_hosts_max_fail and
conf.vmware.memory_reservation_max_ratio_fallback to see how much of total
memory is actually reservable.
"""),
cfg.IntOpt(
'bigvm_exporter_listen_port',
default=9847,
help="""
Port where the BigVM prometheus exporter to listen for HTTP requests.
"""),
]

Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -68,3 +68,4 @@ futurist>=1.8.0 # Apache-2.0
openstacksdk>=0.35.0 # Apache-2.0
dataclasses>=0.7;python_version=='3.6' # Apache 2.0 License
PyYAML>=5.1 # MIT
prometheus_client

0 comments on commit 254e045

Please sign in to comment.