Add prometheus exporter for nova-bigvm

Currently exposing the following metrics:
- Counter nova_bigvm_host_errors{error, vc, host, rp}
- Counter nova_bigvm_no_candidate_error{hv_size}
- Gauge nova_bigvm_host_freeing_up{vc, host, rp}
- Gauge nova_bigvm_free_hosts_count{}
sapcc · May 2, 2023 · 254e045 · 254e045
1 parent 4bb914d
commit 254e045
Show file tree

Hide file tree

Showing 5 changed files with 125 additions and 5 deletions.
diff --git a/nova/bigvm/exporter.py b/nova/bigvm/exporter.py
@@ -0,0 +1,95 @@
+# Copyright 2022 SAP SE
+# All Rights Reserved.
+#
+#    Licensed under the Apache License, Version 2.0 (the "License"); you may
+#    not use this file except in compliance with the License. You may obtain
+#    a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#    License for the specific language governing permissions and limitations
+#    under the License.
+from oslo_log import log as logging
+from prometheus_client import CollectorRegistry
+from prometheus_client import Counter
+from prometheus_client import Gauge
+from prometheus_client import start_http_server
+
+import nova.conf
+
+CONF = nova.conf.CONF
+
+LOG = logging.getLogger(__name__)
+
+# Dedicated registry for the BigVM metrics. The metrics object created in
+# this module and the HTTP exporter started by start_bigvm_exporter() are
+# both bound to it.
+REGISTRY = CollectorRegistry(auto_describe=True)
+
+# Value used for the "error" label of nova_bigvm_host_errors when freeing
+# up a host failed.
+ERROR_FREEING = 'freeing'
+
+
+class _BigVmPrometheusMetrics:
+    """Prometheus metrics recorded by the nova-bigvm service.
+
+    Creates the counters and gauges once and offers small helper methods
+    to update them. The ``rp`` argument taken by several methods is read
+    as a dict providing ``rp['vc']``, ``rp['host']`` and
+    ``rp['rp']['name']``; these become the ``vc``, ``host`` and ``rp``
+    label values of the per-host metrics.
+    """
+
+    def __init__(self, registry):
+        """Create all metrics and register them with ``registry``.
+
+        :param registry: registry object handed to every metric through
+                         its ``registry`` keyword argument
+        """
+        # Per-host error counter, labelled by error code and provider.
+        self.host_errors_counter = \
+            Counter('nova_bigvm_host_errors',
+                    'Counts errors that happened while reconciling '
+                    'a host. The "error" is a short code meaning: '
+                    'freeing = Error while freeing up a host',
+                    labelnames=['error', 'vc', 'host', 'rp'],
+                    registry=registry)
+
+        # Counter of failed candidate searches, labelled by hypervisor size.
+        self.no_candidate_error_counter = \
+            Counter('nova_bigvm_no_candidate_error',
+                    'Counter that increments each time the '
+                    'reconciliation loop cannot find a '
+                    'resource-provider for freeing-up a host.',
+                    labelnames=['hv_size'],
+                    registry=registry)
+
+        # One series per provider that is being freed up; the series is set
+        # to 1 by set_freeing_provider() and dropped again by
+        # remove_freeing_provider().
+        self.host_freeing_up_gauge = \
+            Gauge('nova_bigvm_host_freeing_up',
+                  'Gauge for each BigVM host that is currently '
+                  'being freed up.',
+                  labelnames=['vc', 'host', 'rp'],
+                  registry=registry)
+
+        # Unlabelled gauge holding the last value passed to
+        # set_free_hosts_count().
+        self.free_hosts_count_gauge = \
+            Gauge('nova_bigvm_free_hosts_count',
+                  'The total amount of available BigVM hosts '
+                  'in the region.',
+                  registry=registry)
+
+    def bigvm_host_error(self, error, rp):
+        """Increment the host error counter for one error code and provider.
+
+        :param error: short error code used as the ``error`` label value,
+                      e.g. ``ERROR_FREEING``
+        :param rp: provider dict (see class docstring for the keys read)
+        """
+        self.host_errors_counter.labels(
+            error, rp['vc'], rp['host'], rp['rp']['name']).inc()
+
+    def error_freeing(self, rp):
+        """Count an ``ERROR_FREEING`` error for the given provider.
+
+        :param rp: provider dict (see class docstring for the keys read)
+        """
+        self.bigvm_host_error(ERROR_FREEING, rp)
+
+    def no_candidate_error(self, hv_size):
+        """Increment the no-candidate counter for a hypervisor size.
+
+        :param hv_size: value used for the ``hv_size`` label
+        """
+        self.no_candidate_error_counter.labels(hv_size).inc()
+
+    def set_freeing_provider(self, rp):
+        """Set the "freeing up" gauge of the given provider to 1.
+
+        :param rp: provider dict (see class docstring for the keys read)
+        """
+        self.host_freeing_up_gauge.labels(
+            rp['vc'], rp['host'], rp['rp']['name']).set(1)
+
+    def remove_freeing_provider(self, rp):
+        """Drop the "freeing up" gauge series of the given provider.
+
+        A ``KeyError`` is swallowed; presumably ``remove()`` raises it when
+        no series exists for these labels (verify against the
+        prometheus_client documentation).
+
+        :param rp: provider dict (see class docstring for the keys read)
+        """
+        # NOTE(review): the rp[...] lookups are evaluated inside the try as
+        # well, so an rp dict lacking one of the keys is silently ignored
+        # here, too.
+        try:
+            self.host_freeing_up_gauge.remove(
+                rp['vc'], rp['host'], rp['rp']['name'])
+        except KeyError:
+            pass
+
+    def set_free_hosts_count(self, count):
+        """Set the free-hosts gauge to ``count``.
+
+        :param count: new value of nova_bigvm_free_hosts_count
+        """
+        self.free_hosts_count_gauge.set(count)
+
+
+# Module-level metrics instance bound to REGISTRY; other modules import
+# this object to record metrics (e.g. nova/bigvm/manager.py).
+bigvm_metrics = _BigVmPrometheusMetrics(REGISTRY)
+
+
+def start_bigvm_exporter():
+    """Start the HTTP server that exposes the metrics held in REGISTRY.
+
+    The listen port is read from the ``bigvm_exporter_listen_port`` config
+    option. The chosen port is logged once the server has been started.
+    """
+    port = CONF.bigvm_exporter_listen_port
+    start_http_server(port, registry=REGISTRY)
+    LOG.info("Started BigVM prometheus exporter on port %s", port)
diff --git a/nova/bigvm/manager.py b/nova/bigvm/manager.py
@@ -22,6 +22,7 @@
 from oslo_messaging import exceptions as oslo_exceptions
 from oslo_service import periodic_task
 
+from nova.bigvm.exporter import bigvm_metrics
 import nova.conf
 from nova import context as nova_context
 from nova import exception
@@ -166,6 +167,7 @@ def _flatten(list_of_lists):
                              'max_used': CONF.bigvm_cluster_max_usage_percent,
                              'max_reserved':
                                 CONF.bigvm_cluster_max_reservation_percent})
+                bigvm_metrics.no_candidate_error(hv_size)
                 continue
 
             # filter out providers that are disabled for bigVMs
@@ -181,6 +183,7 @@ def _flatten(list_of_lists):
                             'host for hypervisor size %(hv_size)d, because '
                             'all providers with enough space are disabled.',
                             {'hv_size': hv_size})
+                bigvm_metrics.no_candidate_error(hv_size)
                 continue
 
             candidates[hv_size] = (alloc_reqs, filtered_provider_summaries)
@@ -192,6 +195,7 @@ def _flatten(list_of_lists):
                                 'up a host for hypervisor size %(hv_size)d in '
                                 '%(vc)s.',
                                 {'hv_size': hv_size, 'vc': vc})
+                    bigvm_metrics.no_candidate_error(hv_size)
                     continue
                 alloc_reqs, provider_summaries = candidates[hv_size]
 
@@ -214,7 +218,7 @@ def _free_memory(p):
                         cm = vmware_providers[rp_uuid]['cell_mapping']
                         with nova_context.target_cell(context, cm) as cctxt:
                             if self._free_host_for_provider(cctxt, rp_uuid,
-                                                            host):
+                                                            host, vc):
                                 break
                 except oslo_exceptions.MessagingTimeout as e:
                     # we don't know if the timeout happened after we started
@@ -647,6 +651,7 @@ def _get_missing_hv_sizes(self, context, vcenters,
         """
         found_hv_sizes_per_vc = {vc: set() for vc in vcenters}
 
+        free_hosts = 0
         for rp_uuid, rp in bigvm_providers.items():
             host_rp_uuid = rp['host_rp_uuid']
             hv_size = vmware_providers[host_rp_uuid]['hv_size']
@@ -662,16 +667,25 @@ def _get_missing_hv_sizes(self, context, vcenters,
 
                 if state == special_spawning.FREE_HOST_STATE_DONE:
                     self._add_resources_to_provider(context, rp_uuid, rp)
+                    bigvm_metrics.remove_freeing_provider(rp)
+                    free_hosts += 1
                 elif state == special_spawning.FREE_HOST_STATE_ERROR:
                     LOG.warning('Freeing a host for spawning failed on '
                                 '%(host)s.',
                                 {'host': rp['host']})
                     # do some cleanup, so another compute-node is used
                     found_hv_sizes_per_vc[rp['vc']].remove(hv_size)
+                    bigvm_metrics.remove_freeing_provider(rp)
+                    bigvm_metrics.error_freeing(rp)
                     self._clean_up_consumed_provider(context, rp_uuid, rp)
                 else:
                     LOG.info('Waiting for host on %(host)s to free up.',
                              {'host': rp['host']})
+                    bigvm_metrics.set_freeing_provider(rp)
+            else:
+                free_hosts += 1
+
+        bigvm_metrics.set_free_hosts_count(free_hosts)
 
         hv_sizes_per_vc = {
             vc: set(rp['hv_size'] for rp in vmware_providers.values()
@@ -708,7 +722,7 @@ def _add_resources_to_provider(self, context, rp_uuid, rp):
                      'on %(host)s.',
                      {'host': rp['host']})
 
-    def _free_host_for_provider(self, context, rp_uuid, host):
+    def _free_host_for_provider(self, context, rp_uuid, host, vc):
         """Takes care of creating a child resource provider in placement to
         "claim" a resource-provider/host for freeing up a host. Then calls the
         driver to actually free up the host in the cluster.
@@ -781,16 +795,17 @@ def _free_host_for_provider(self, context, rp_uuid, host):
 
             # find a host and let DRS free it up
             state = self.special_spawn_rpc.free_host(context, host)
-
+            new_rp = {'host': host,
+                      'vc': vc,
+                      'rp': {'name': new_rp_name}}
             if state == special_spawning.FREE_HOST_STATE_DONE:
                 # there were free resources available immediately
                 needs_cleanup = False
-                new_rp = {'host': host,
-                          'rp': {'name': new_rp_name}}
                 self._add_resources_to_provider(context, new_rp_uuid, new_rp)
             elif state == special_spawning.FREE_HOST_STATE_STARTED:
                 # it started working on it. we have to check back later
                 # if it's done
+                bigvm_metrics.set_freeing_provider(new_rp)
                 needs_cleanup = False
         finally:
             # clean up placement, if something went wrong

diff --git a/nova/cmd/bigvm.py b/nova/cmd/bigvm.py
@@ -21,6 +21,7 @@
 from oslo_reports import guru_meditation_report as gmr
 from oslo_reports import opts as gmr_opts
 
+from nova.bigvm.exporter import start_bigvm_exporter
 import nova.conf
 from nova import config
 from nova import objects
@@ -39,6 +40,8 @@ def main():
 
     gmr.TextGuruMeditation.setup_autorun(version, conf=CONF)
 
+    start_bigvm_exporter()
+
     server = service.Service.create(binary='nova-bigvm')
     service.serve(server)
     service.wait()
diff --git a/nova/conf/base.py b/nova/conf/base.py
@@ -192,6 +192,12 @@
 Compare the values of conf.vmware.memory_reservation_cluster_hosts_max_fail and
 conf.vmware.memory_reservation_max_ratio_fallback to see how much of total
 memory is actually reservable.
+"""),
+    cfg.IntOpt(
+        'bigvm_exporter_listen_port',
+        default=9847,
+        help="""
+Port on which the BigVM prometheus exporter listens for HTTP requests.
 """),
 ]
 

diff --git a/requirements.txt b/requirements.txt
@@ -68,3 +68,4 @@ futurist>=1.8.0 # Apache-2.0
 openstacksdk>=0.35.0 # Apache-2.0
 dataclasses>=0.7;python_version=='3.6'  # Apache 2.0 License
 PyYAML>=5.1 # MIT
+prometheus_client