Merge pull request #88 from naved001/gpu-query-fix
naved001 authored Nov 22, 2024
2 parents (201aa24 + 7d2f256), commit 3cb070d
Showing 3 changed files with 109 additions and 7 deletions.
openshift_metrics/metrics_processor.py (22 additions, 0 deletions)
@@ -171,3 +171,25 @@ def _was_pod_stopped(current_time: int, previous_time: int, interval: int) -> bool:
         is more than the frequency of our metric collection
         """
         return (current_time - previous_time) > interval
+
+    @staticmethod
+    def insert_node_labels(node_labels: list, resource_request_metrics: list) -> list:
+        """Inserts node labels into resource_request_metrics"""
+        node_label_dict = {}
+        for node_label in node_labels:
+            node = node_label["metric"]["node"]
+            gpu = node_label["metric"].get("label_nvidia_com_gpu_product")
+            machine = node_label["metric"].get("label_nvidia_com_gpu_machine")
+            node_label_dict[node] = {"gpu": gpu, "machine": machine}
+        for pod in resource_request_metrics:
+            node = pod["metric"]["node"]
+            if node not in node_label_dict:
+                logger.warning("Could not find labels for node: %s", node)
+                continue
+            pod["metric"]["label_nvidia_com_gpu_product"] = node_label_dict[node].get(
+                "gpu"
+            )
+            pod["metric"]["label_nvidia_com_gpu_machine"] = node_label_dict[node].get(
+                "machine"
+            )
+        return resource_request_metrics
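
As a quick illustration, here is a minimal, hypothetical call to the new helper. The payload shapes mirror the Prometheus query results exercised by the new test further down; the sample values are illustrative only, not taken from a real cluster:

from openshift_metrics.metrics_processor import MetricsProcessor

node_labels = [
    {
        "metric": {
            "node": "wrk-1",
            "label_nvidia_com_gpu_product": "NVIDIA-A100-SXM4-40GB",
            "label_nvidia_com_gpu_machine": "ThinkSystem-SD650-N-V2",
        },
        "values": [[1730939400, "1"]],
    }
]
gpu_requests = [
    {
        "metric": {"pod": "TestPodA", "node": "wrk-1", "namespace": "namespace1"},
        "values": [[1730939400, "4"]],
    }
]

merged = MetricsProcessor.insert_node_labels(node_labels, gpu_requests)
# Each pod series now carries its node's GPU labels:
print(merged[0]["metric"]["label_nvidia_com_gpu_product"])  # NVIDIA-A100-SXM4-40GB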
openshift_metrics/openshift_prometheus_metrics.py (5 additions, 7 deletions)
@@ -21,15 +21,12 @@
 
 from openshift_metrics import utils
 from openshift_metrics.prometheus_client import PrometheusClient
-
+from openshift_metrics.metrics_processor import MetricsProcessor
 
 CPU_REQUEST = 'kube_pod_resource_request{unit="cores"} unless on(pod, namespace) kube_pod_status_unschedulable'
 MEMORY_REQUEST = 'kube_pod_resource_request{unit="bytes"} unless on(pod, namespace) kube_pod_status_unschedulable'
-
-# For GPU requests, we don't need to exclude unscheduled pods because the join on node will eliminate those as unscheduled
-# pods don't have a node value
-GPU_REQUEST = 'kube_pod_resource_request{resource=~"nvidia.com.*"} * on(node) group_left(label_nvidia_com_gpu_product, label_nvidia_com_gpu_machine) kube_node_labels'
-
+GPU_REQUEST = 'kube_pod_resource_request{resource=~"nvidia.com.*"} unless on(pod, namespace) kube_pod_status_unschedulable'
+KUBE_NODE_LABELS = 'kube_node_labels{label_nvidia_com_gpu_product!=""}'
 
 def main():
     """This method kick starts the process of collecting and saving the metrics"""
@@ -98,7 +95,8 @@ def main():
         gpu_request_metrics = prom_client.query_metric(
             GPU_REQUEST, report_start_date, report_end_date
         )
-        metrics_dict["gpu_metrics"] = gpu_request_metrics
+        node_labels = prom_client.query_metric(KUBE_NODE_LABELS, report_start_date, report_end_date)
+        metrics_dict["gpu_metrics"] = MetricsProcessor.insert_node_labels(node_labels, gpu_request_metrics)
     except utils.EmptyResultError:
         pass
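
Worth noting for reviewers: the removed GPU_REQUEST relied on PromQL's `* on(node) group_left(...)` join, which dropped unscheduled pods implicitly because, as the removed comment says, they carry no node value. The replacement query filters them explicitly with `unless`, matching CPU_REQUEST and MEMORY_REQUEST. That filter also matters for the new client-side join, since insert_node_labels assumes every series has a node. A minimal sketch of that failure mode, with hypothetical pod data not taken from the repo:

from openshift_metrics.metrics_processor import MetricsProcessor

# A pod that was never scheduled has no "node" label in its metric.
pending_pod = {"metric": {"pod": "PendingPod", "namespace": "ns1"}, "values": []}

# insert_node_labels looks up pod["metric"]["node"] unconditionally, so an
# unscheduled pod slipping past the query filter would raise KeyError here.
MetricsProcessor.insert_node_labels([], [pending_pod])  # KeyError: 'node'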

openshift_metrics/tests/test_metrics_processor.py (82 additions, 0 deletions)
@@ -674,3 +674,85 @@ def test_extract_gpu_info_no_info_anywhere(self):
         gpu_info = processor._extract_gpu_info("gpu_request", metric_with_label)
 
         assert gpu_info.gpu_type == metrics_processor.GPU_UNKNOWN_TYPE
+
+
+class TestInsertNodeLabels(TestCase):
+    def test_insert_node_labels(self):
+        resource_request_metrics = [
+            {
+                "metric": {
+                    "pod": "TestPodA",
+                    "node": "wrk-1",
+                    "namespace": "namespace1",
+                },
+                "values": [[1730939400, "4"], [1730940300, "4"], [1730941200, "4"]],
+            },
+            {
+                "metric": {
+                    "pod": "TestPodB",
+                    "node": "wrk-2",
+                    "namespace": "namespace2",
+                },
+                "values": [[1730939400, "4"], [1730940300, "4"], [1730941200, "4"]],
+            },
+            {
+                "metric": {
+                    "pod": "TestPodC",
+                    "node": "wrk-3",  # let's assume this node doesn't have any associated labels
+                    "namespace": "namespace2",
+                },
+                "values": [[1730939400, "4"], [1730940300, "4"], [1730941200, "4"]],
+            },
+        ]
+        kube_node_labels = [
+            {
+                "metric": {
+                    "node": "wrk-1",
+                    "label_nvidia_com_gpu_machine": "ThinkSystem-SD650-N-V2",
+                    "label_nvidia_com_gpu_product": "NVIDIA-A100-SXM4-40GB",
+                },
+                "values": [[1730939400, "1"], [1730940300, "1"]],
+            },
+            {
+                "metric": {
+                    "node": "wrk-2",
+                    "label_nvidia_com_gpu_product": "Tesla-V100-PCIE-32GB",
+                    "label_nvidia_com_gpu_machine": "PowerEdge-R740xd",
+                },
+                "values": [[1730939400, "1"], [1730940300, "1"]],
+            },
+        ]
+        metrics_with_labels = metrics_processor.MetricsProcessor.insert_node_labels(
+            kube_node_labels, resource_request_metrics
+        )
+        expected_metrics = [
+            {
+                "metric": {
+                    "pod": "TestPodA",
+                    "node": "wrk-1",
+                    "namespace": "namespace1",
+                    "label_nvidia_com_gpu_machine": "ThinkSystem-SD650-N-V2",
+                    "label_nvidia_com_gpu_product": "NVIDIA-A100-SXM4-40GB",
+                },
+                "values": [[1730939400, "4"], [1730940300, "4"], [1730941200, "4"]],
+            },
+            {
+                "metric": {
+                    "pod": "TestPodB",
+                    "node": "wrk-2",
+                    "namespace": "namespace2",
+                    "label_nvidia_com_gpu_product": "Tesla-V100-PCIE-32GB",
+                    "label_nvidia_com_gpu_machine": "PowerEdge-R740xd",
+                },
+                "values": [[1730939400, "4"], [1730940300, "4"], [1730941200, "4"]],
+            },
+            {
+                "metric": {
+                    "pod": "TestPodC",
+                    "node": "wrk-3",
+                    "namespace": "namespace2",
+                },
+                "values": [[1730939400, "4"], [1730940300, "4"], [1730941200, "4"]],
+            },
+        ]
+        self.assertEqual(expected_metrics, metrics_with_labels)
