From 254e04509b84913ec7ee17f3ffbb291e3c52e4d0 Mon Sep 17 00:00:00 2001 From: Marius Leustean Date: Tue, 20 Dec 2022 09:18:40 +0200 Subject: [PATCH] Add prometheus exporter for nova-bigvm Currently exposing the following metrics: Counter nova_bigvm_host_errors{error, vc, host, rp} Counter nova_bigvm_no_candidate_error{hv_size} Gauge nova_bigvm_host_freeing_up{vc, host, rp} Gauge nova_bigvm_free_hosts_count{} --- nova/bigvm/exporter.py | 95 ++++++++++++++++++++++++++++++++++++++++++ nova/bigvm/manager.py | 25 ++++++++--- nova/cmd/bigvm.py | 3 ++ nova/conf/base.py | 6 +++ requirements.txt | 1 + 5 files changed, 125 insertions(+), 5 deletions(-) create mode 100644 nova/bigvm/exporter.py diff --git a/nova/bigvm/exporter.py b/nova/bigvm/exporter.py new file mode 100644 index 00000000000..1d5a6ce0db7 --- /dev/null +++ b/nova/bigvm/exporter.py @@ -0,0 +1,95 @@ +# Copyright 2022 SAP SE +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+from oslo_log import log as logging +from prometheus_client import CollectorRegistry +from prometheus_client import Counter +from prometheus_client import Gauge +from prometheus_client import start_http_server + +import nova.conf + +CONF = nova.conf.CONF + +LOG = logging.getLogger(__name__) + +REGISTRY = CollectorRegistry(auto_describe=True) + +ERROR_FREEING = 'freeing' + + +class _BigVmPrometheusMetrics: + + def __init__(self, registry): + self.host_errors_counter = \ + Counter('nova_bigvm_host_errors', + 'Counts errors that happened while reconciling ' + 'a host. The "error" is a short code meaning: ' + 'freeing = Error while freeing up a host', + labelnames=['error', 'vc', 'host', 'rp'], + registry=registry) + + self.no_candidate_error_counter = \ + Counter('nova_bigvm_no_candidate_error', + 'Counter that increments each time the ' + 'reconciliation loop cannot find a ' + 'resource-provider for freeing-up a host.', + labelnames=['hv_size'], + registry=registry) + + self.host_freeing_up_gauge = \ + Gauge('nova_bigvm_host_freeing_up', + 'Gauge for each BigVM host that is currently ' + 'being freed up.', + labelnames=['vc', 'host', 'rp'], + registry=registry) + + self.free_hosts_count_gauge = \ + Gauge('nova_bigvm_free_hosts_count', + 'The total amount of available BigVM hosts ' + 'in the region.', + registry=registry) + + def bigvm_host_error(self, error, rp): + self.host_errors_counter.labels( + error, rp['vc'], rp['host'], rp['rp']['name']).inc() + + def error_freeing(self, rp): + self.bigvm_host_error(ERROR_FREEING, rp) + + def no_candidate_error(self, hv_size): + self.no_candidate_error_counter.labels(hv_size).inc() + + def set_freeing_provider(self, rp): + self.host_freeing_up_gauge.labels( + rp['vc'], rp['host'], rp['rp']['name']).set(1) + + def remove_freeing_provider(self, rp): + try: + self.host_freeing_up_gauge.remove( + rp['vc'], rp['host'], rp['rp']['name']) + except KeyError: + pass + + def set_free_hosts_count(self, count): + 
self.free_hosts_count_gauge.set(count) + + +bigvm_metrics = _BigVmPrometheusMetrics(REGISTRY) + + +def start_bigvm_exporter(): + port = CONF.bigvm_exporter_listen_port + start_http_server(port, registry=REGISTRY) + LOG.info("Started BigVM prometheus exporter on port %s", port) diff --git a/nova/bigvm/manager.py b/nova/bigvm/manager.py index 002d0467365..2f63a1359cd 100644 --- a/nova/bigvm/manager.py +++ b/nova/bigvm/manager.py @@ -22,6 +22,7 @@ from oslo_messaging import exceptions as oslo_exceptions from oslo_service import periodic_task +from nova.bigvm.exporter import bigvm_metrics import nova.conf from nova import context as nova_context from nova import exception @@ -166,6 +167,7 @@ def _flatten(list_of_lists): 'max_used': CONF.bigvm_cluster_max_usage_percent, 'max_reserved': CONF.bigvm_cluster_max_reservation_percent}) + bigvm_metrics.no_candidate_error(hv_size) continue # filter out providers that are disabled for bigVMs @@ -181,6 +183,7 @@ def _flatten(list_of_lists): 'host for hypervisor size %(hv_size)d, because ' 'all providers with enough space are disabled.', {'hv_size': hv_size}) + bigvm_metrics.no_candidate_error(hv_size) continue candidates[hv_size] = (alloc_reqs, filtered_provider_summaries) @@ -192,6 +195,7 @@ def _flatten(list_of_lists): 'up a host for hypervisor size %(hv_size)d in ' '%(vc)s.', {'hv_size': hv_size, 'vc': vc}) + bigvm_metrics.no_candidate_error(hv_size) continue alloc_reqs, provider_summaries = candidates[hv_size] @@ -214,7 +218,7 @@ def _free_memory(p): cm = vmware_providers[rp_uuid]['cell_mapping'] with nova_context.target_cell(context, cm) as cctxt: if self._free_host_for_provider(cctxt, rp_uuid, - host): + host, vc): break except oslo_exceptions.MessagingTimeout as e: # we don't know if the timeout happened after we started @@ -647,6 +651,7 @@ def _get_missing_hv_sizes(self, context, vcenters, """ found_hv_sizes_per_vc = {vc: set() for vc in vcenters} + free_hosts = 0 for rp_uuid, rp in bigvm_providers.items(): host_rp_uuid = 
rp['host_rp_uuid'] hv_size = vmware_providers[host_rp_uuid]['hv_size'] @@ -662,16 +667,25 @@ def _get_missing_hv_sizes(self, context, vcenters, if state == special_spawning.FREE_HOST_STATE_DONE: self._add_resources_to_provider(context, rp_uuid, rp) + bigvm_metrics.remove_freeing_provider(rp) + free_hosts += 1 elif state == special_spawning.FREE_HOST_STATE_ERROR: LOG.warning('Freeing a host for spawning failed on ' '%(host)s.', {'host': rp['host']}) # do some cleanup, so another compute-node is used found_hv_sizes_per_vc[rp['vc']].remove(hv_size) + bigvm_metrics.remove_freeing_provider(rp) + bigvm_metrics.error_freeing(rp) self._clean_up_consumed_provider(context, rp_uuid, rp) else: LOG.info('Waiting for host on %(host)s to free up.', {'host': rp['host']}) + bigvm_metrics.set_freeing_provider(rp) + else: + free_hosts += 1 + + bigvm_metrics.set_free_hosts_count(free_hosts) hv_sizes_per_vc = { vc: set(rp['hv_size'] for rp in vmware_providers.values() @@ -708,7 +722,7 @@ def _add_resources_to_provider(self, context, rp_uuid, rp): 'on %(host)s.', {'host': rp['host']}) - def _free_host_for_provider(self, context, rp_uuid, host): + def _free_host_for_provider(self, context, rp_uuid, host, vc): """Takes care of creating a child resource provider in placement to "claim" a resource-provider/host for freeing up a host. Then calls the driver to actually free up the host in the cluster. @@ -781,16 +795,17 @@ def _free_host_for_provider(self, context, rp_uuid, host): # find a host and let DRS free it up state = self.special_spawn_rpc.free_host(context, host) - + new_rp = {'host': host, + 'vc': vc, + 'rp': {'name': new_rp_name}} if state == special_spawning.FREE_HOST_STATE_DONE: # there were free resources available immediately needs_cleanup = False - new_rp = {'host': host, - 'rp': {'name': new_rp_name}} self._add_resources_to_provider(context, new_rp_uuid, new_rp) elif state == special_spawning.FREE_HOST_STATE_STARTED: # it started working on it. 
we have to check back later # if it's done + bigvm_metrics.set_freeing_provider(new_rp) needs_cleanup = False finally: # clean up placement, if something went wrong diff --git a/nova/cmd/bigvm.py b/nova/cmd/bigvm.py index 0d39516d4d8..1dca99f543f 100644 --- a/nova/cmd/bigvm.py +++ b/nova/cmd/bigvm.py @@ -21,6 +21,7 @@ from oslo_reports import guru_meditation_report as gmr from oslo_reports import opts as gmr_opts +from nova.bigvm.exporter import start_bigvm_exporter import nova.conf from nova import config from nova import objects @@ -39,6 +40,8 @@ def main(): gmr.TextGuruMeditation.setup_autorun(version, conf=CONF) + start_bigvm_exporter() + server = service.Service.create(binary='nova-bigvm') service.serve(server) service.wait() diff --git a/nova/conf/base.py b/nova/conf/base.py index 6e55bb98e5e..3f528ba432b 100644 --- a/nova/conf/base.py +++ b/nova/conf/base.py @@ -192,6 +192,12 @@ Compare the values of conf.vmware.memory_reservation_cluster_hosts_max_fail and conf.vmware.memory_reservation_max_ratio_fallback to see how much of total memory is actually reservable. +"""), + cfg.IntOpt( + 'bigvm_exporter_listen_port', + default=9847, + help=""" +Port on which the BigVM prometheus exporter listens for HTTP requests. """), ] diff --git a/requirements.txt b/requirements.txt index a8bed744fba..9fc4de5b846 100644 --- a/requirements.txt +++ b/requirements.txt @@ -68,3 +68,4 @@ futurist>=1.8.0 # Apache-2.0 openstacksdk>=0.35.0 # Apache-2.0 dataclasses>=0.7;python_version=='3.6' # Apache 2.0 License PyYAML>=5.1 # MIT +prometheus_client \ No newline at end of file