Merge pull request #88 from naved001/gpu-query-fix
naved001 authored Nov 22, 2024
2 parents (201aa24 + 7d2f256), commit 3cb070d
Showing 3 changed files with 109 additions and 7 deletions.
openshift_metrics/metrics_processor.py (22 additions, 0 deletions)
@@ -171,3 +171,25 @@ def _was_pod_stopped(current_time: int, previous_time: int, interval: int) -> bool:
         is more than the frequency of our metric collection
         """
         return (current_time - previous_time) > interval
+
+    @staticmethod
+    def insert_node_labels(node_labels: list, resource_request_metrics: list) -> list:
+        """Inserts node labels into resource_request_metrics"""
+        node_label_dict = {}
+        for node_label in node_labels:
+            node = node_label["metric"]["node"]
+            gpu = node_label["metric"].get("label_nvidia_com_gpu_product")
+            machine = node_label["metric"].get("label_nvidia_com_gpu_machine")
+            node_label_dict[node] = {"gpu": gpu, "machine": machine}
+        for pod in resource_request_metrics:
+            node = pod["metric"]["node"]
+            if node not in node_label_dict:
+                logger.warning("Could not find labels for node: %s", node)
+                continue
+            pod["metric"]["label_nvidia_com_gpu_product"] = node_label_dict[node].get(
+                "gpu"
+            )
+            pod["metric"]["label_nvidia_com_gpu_machine"] = node_label_dict[node].get(
+                "machine"
+            )
+        return resource_request_metrics
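
As a quick illustration, here is a minimal, hypothetical call to the new helper. The payload shapes mirror the Prometheus query results exercised by the new test further down; the sample values are illustrative only, not taken from a real cluster:

from openshift_metrics.metrics_processor import MetricsProcessor

node_labels = [
    {
        "metric": {
            "node": "wrk-1",
            "label_nvidia_com_gpu_product": "NVIDIA-A100-SXM4-40GB",
            "label_nvidia_com_gpu_machine": "ThinkSystem-SD650-N-V2",
        },
        "values": [[1730939400, "1"]],
    }
]
gpu_requests = [
    {
        "metric": {"pod": "TestPodA", "node": "wrk-1", "namespace": "namespace1"},
        "values": [[1730939400, "4"]],
    }
]

merged = MetricsProcessor.insert_node_labels(node_labels, gpu_requests)
# Each pod series now carries its node's GPU labels:
print(merged[0]["metric"]["label_nvidia_com_gpu_product"])  # NVIDIA-A100-SXM4-40GB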
openshift_metrics/openshift_prometheus_metrics.py (5 additions, 7 deletions)
@@ -21,15 +21,12 @@
 
 from openshift_metrics import utils
 from openshift_metrics.prometheus_client import PrometheusClient
-
+from openshift_metrics.metrics_processor import MetricsProcessor
 
 CPU_REQUEST = 'kube_pod_resource_request{unit="cores"} unless on(pod, namespace) kube_pod_status_unschedulable'
 MEMORY_REQUEST = 'kube_pod_resource_request{unit="bytes"} unless on(pod, namespace) kube_pod_status_unschedulable'
-
-# For GPU requests, we don't need to exclude unscheduled pods because the join on node will eliminate those as unscheduled
-# pods don't have a node value
-GPU_REQUEST = 'kube_pod_resource_request{resource=~"nvidia.com.*"} * on(node) group_left(label_nvidia_com_gpu_product, label_nvidia_com_gpu_machine) kube_node_labels'
-
+GPU_REQUEST = 'kube_pod_resource_request{resource=~"nvidia.com.*"} unless on(pod, namespace) kube_pod_status_unschedulable'
+KUBE_NODE_LABELS = 'kube_node_labels{label_nvidia_com_gpu_product!=""}'
 
 def main():
     """This method kick starts the process of collecting and saving the metrics"""
@@ -98,7 +95,8 @@ def main():
         gpu_request_metrics = prom_client.query_metric(
             GPU_REQUEST, report_start_date, report_end_date
         )
-        metrics_dict["gpu_metrics"] = gpu_request_metrics
+        node_labels = prom_client.query_metric(KUBE_NODE_LABELS, report_start_date, report_end_date)
+        metrics_dict["gpu_metrics"] = MetricsProcessor.insert_node_labels(node_labels, gpu_request_metrics)
     except utils.EmptyResultError:
         pass
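
Worth noting for reviewers: the removed GPU_REQUEST relied on PromQL's `* on(node) group_left(...)` join, which dropped unscheduled pods implicitly because, as the removed comment says, they carry no node value. The replacement query filters them explicitly with `unless`, matching CPU_REQUEST and MEMORY_REQUEST. That filter also matters for the new client-side join, since insert_node_labels assumes every series has a node. A minimal sketch of that failure mode, with hypothetical pod data not taken from the repo:

from openshift_metrics.metrics_processor import MetricsProcessor

# A pod that was never scheduled has no "node" label in its metric.
pending_pod = {"metric": {"pod": "PendingPod", "namespace": "ns1"}, "values": []}

# insert_node_labels looks up pod["metric"]["node"] unconditionally, so an
# unscheduled pod slipping past the query filter would raise KeyError here.
MetricsProcessor.insert_node_labels([], [pending_pod])  # KeyError: 'node'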

openshift_metrics/tests/test_metrics_processor.py (82 additions, 0 deletions)
@@ -674,3 +674,85 @@ def test_extract_gpu_info_no_info_anywhere(self):
         gpu_info = processor._extract_gpu_info("gpu_request", metric_with_label)
 
         assert gpu_info.gpu_type == metrics_processor.GPU_UNKNOWN_TYPE
+
+
+class TestInsertNodeLabels(TestCase):
+    def test_insert_node_labels(self):
+        resource_request_metrics = [
+            {
+                "metric": {
+                    "pod": "TestPodA",
+                    "node": "wrk-1",
+                    "namespace": "namespace1",
+                },
+                "values": [[1730939400, "4"], [1730940300, "4"], [1730941200, "4"]],
+            },
+            {
+                "metric": {
+                    "pod": "TestPodB",
+                    "node": "wrk-2",
+                    "namespace": "namespace2",
+                },
+                "values": [[1730939400, "4"], [1730940300, "4"], [1730941200, "4"]],
+            },
+            {
+                "metric": {
+                    "pod": "TestPodC",
+                    "node": "wrk-3",  # let's assume this node doesn't have any associated labels
+                    "namespace": "namespace2",
+                },
+                "values": [[1730939400, "4"], [1730940300, "4"], [1730941200, "4"]],
+            },
+        ]
+        kube_node_labels = [
+            {
+                "metric": {
+                    "node": "wrk-1",
+                    "label_nvidia_com_gpu_machine": "ThinkSystem-SD650-N-V2",
+                    "label_nvidia_com_gpu_product": "NVIDIA-A100-SXM4-40GB",
+                },
+                "values": [[1730939400, "1"], [1730940300, "1"]],
+            },
+            {
+                "metric": {
+                    "node": "wrk-2",
+                    "label_nvidia_com_gpu_product": "Tesla-V100-PCIE-32GB",
+                    "label_nvidia_com_gpu_machine": "PowerEdge-R740xd",
+                },
+                "values": [[1730939400, "1"], [1730940300, "1"]],
+            },
+        ]
+        metrics_with_labels = metrics_processor.MetricsProcessor.insert_node_labels(
+            kube_node_labels, resource_request_metrics
+        )
+        expected_metrics = [
+            {
+                "metric": {
+                    "pod": "TestPodA",
+                    "node": "wrk-1",
+                    "namespace": "namespace1",
+                    "label_nvidia_com_gpu_machine": "ThinkSystem-SD650-N-V2",
+                    "label_nvidia_com_gpu_product": "NVIDIA-A100-SXM4-40GB",
+                },
+                "values": [[1730939400, "4"], [1730940300, "4"], [1730941200, "4"]],
+            },
+            {
+                "metric": {
+                    "pod": "TestPodB",
+                    "node": "wrk-2",
+                    "namespace": "namespace2",
+                    "label_nvidia_com_gpu_product": "Tesla-V100-PCIE-32GB",
+                    "label_nvidia_com_gpu_machine": "PowerEdge-R740xd",
+                },
+                "values": [[1730939400, "4"], [1730940300, "4"], [1730941200, "4"]],
+            },
+            {
+                "metric": {
+                    "pod": "TestPodC",
+                    "node": "wrk-3",
+                    "namespace": "namespace2",
+                },
+                "values": [[1730939400, "4"], [1730940300, "4"], [1730941200, "4"]],
+            },
+        ]
+        self.assertEqual(expected_metrics, metrics_with_labels)
