Jsonnet: fix KEDA autoscaling metric errors during rollouts (#10013)
Signed-off-by: Yuri Nikolic <[email protected]>
duricanikolic authored Nov 25, 2024
1 parent 2edcccd commit ca89adb
Showing 7 changed files with 2 additions and 1,066 deletions.
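
The change removes the trailing "missing samples" guard from every generated KEDA autoscaling query in the regenerated test outputs below. Copied from the distributor CPU trigger (the memory triggers use container_memory_working_set_bytes instead of container_cpu_usage_seconds_total), the removed clause is:

    and
    count (
      count_over_time(
        present_over_time(
          container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m]
        )[15m:1m]
      ) >= 15
    )

The clause only lets the scaling query return a value when at least one matching series had samples at every one-minute step over the last 15 minutes. Per the commit title this produced KEDA metric errors during rollouts, presumably because the presence check can fail while pods are being replaced, at which point the "and" empties the query result and KEDA reports an error for it. The guard is therefore dropped, and the existing #7691 CHANGELOG entry gains a reference to #10013.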
2 changes: 1 addition & 1 deletion CHANGELOG.md
@@ -545,7 +545,7 @@
* [ENHANCEMENT] Add `_config.autoscaling_querier_predictive_scaling_enabled` to scale querier based on inflight queries 7 days ago. #7775
* [ENHANCEMENT] Add support to autoscale ruler-querier replicas based on in-flight queries too (in addition to CPU and memory based scaling). #8060 #8188
* [ENHANCEMENT] Distributor: improved distributor HPA scaling metric to only take in account ready pods. This requires the metric `kube_pod_status_ready` to be available in the data source used by KEDA to query scaling metrics (configured via `_config.autoscaling_prometheus_url`). #8251
-* [BUGFIX] Guard against missing samples in KEDA queries. #7691
+* [BUGFIX] Guard against missing samples in KEDA queries. #7691 #10013
* [BUGFIX] Alertmanager: Set -server.http-idle-timeout to avoid EOF errors in ruler. #8192

### Mimirtool
@@ -1974,14 +1974,6 @@ spec:
max by (pod) (up{container="alertmanager",namespace="default"}) > 0
)[15m:]
) * 1000
and
count (
count_over_time(
present_over_time(
container_cpu_usage_seconds_total{container="alertmanager",namespace="default"}[1m]
)[15m:1m]
) >= 15
)
serverAddress: http://prometheus.default:9090/prometheus
threshold: "1780"
name: cortex_alertmanager_cpu_hpa_default
@@ -2008,14 +2000,6 @@ spec:
max by (pod) (kube_pod_container_status_last_terminated_reason{container="alertmanager", namespace="default", reason="OOMKilled"})
or vector(0)
)
and
count (
count_over_time(
present_over_time(
container_memory_working_set_bytes{container="alertmanager",namespace="default"}[1m]
)[15m:1m]
) >= 15
)
serverAddress: http://prometheus.default:9090/prometheus
threshold: "9556302233"
name: cortex_alertmanager_memory_hpa_default
@@ -2062,14 +2046,6 @@ spec:
max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0
)[15m:]
) * 1000
and
count (
count_over_time(
present_over_time(
container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m]
)[15m:1m]
) >= 15
)
serverAddress: http://prometheus.default:9090/prometheus
threshold: "1780"
name: cortex_distributor_cpu_hpa_default
@@ -2096,14 +2072,6 @@ spec:
max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"})
or vector(0)
)
and
count (
count_over_time(
present_over_time(
container_memory_working_set_bytes{container="distributor",namespace="default"}[1m]
)[15m:1m]
) >= 15
)
serverAddress: http://prometheus.default:9090/prometheus
threshold: "3058016714"
name: cortex_distributor_memory_hpa_default
@@ -2193,14 +2161,6 @@ spec:
max by (pod) (up{container="query-frontend",namespace="default"}) > 0
)[15m:]
) * 1000
and
count (
count_over_time(
present_over_time(
container_cpu_usage_seconds_total{container="query-frontend",namespace="default"}[1m]
)[15m:1m]
) >= 15
)
serverAddress: http://prometheus.default:9090/prometheus
threshold: "2225"
name: query_frontend_cpu_hpa_default
@@ -2227,14 +2187,6 @@ spec:
max by (pod) (kube_pod_container_status_last_terminated_reason{container="query-frontend", namespace="default", reason="OOMKilled"})
or vector(0)
)
and
count (
count_over_time(
present_over_time(
container_memory_working_set_bytes{container="query-frontend",namespace="default"}[1m]
)[15m:1m]
) >= 15
)
serverAddress: http://prometheus.default:9090/prometheus
threshold: "559939584"
name: query_frontend_memory_hpa_default
@@ -2271,14 +2223,6 @@ spec:
max by (pod) (up{container="ruler",namespace="default"}) > 0
)[15m:]
) * 1000
and
count (
count_over_time(
present_over_time(
container_cpu_usage_seconds_total{container="ruler",namespace="default"}[1m]
)[15m:1m]
) >= 15
)
serverAddress: http://prometheus.default:9090/prometheus
threshold: "890"
name: ruler_cpu_hpa_default
@@ -2305,14 +2249,6 @@ spec:
max by (pod) (kube_pod_container_status_last_terminated_reason{container="ruler", namespace="default", reason="OOMKilled"})
or vector(0)
)
and
count (
count_over_time(
present_over_time(
container_memory_working_set_bytes{container="ruler",namespace="default"}[1m]
)[15m:1m]
) >= 15
)
serverAddress: http://prometheus.default:9090/prometheus
threshold: "5733781340"
name: ruler_memory_hpa_default
@@ -2349,14 +2285,6 @@ spec:
max by (pod) (up{container="ruler-querier",namespace="default"}) > 0
)[15m:]
) * 1000
and
count (
count_over_time(
present_over_time(
container_cpu_usage_seconds_total{container="ruler-querier",namespace="default"}[1m]
)[15m:1m]
) >= 15
)
serverAddress: http://prometheus.default:9090/prometheus
threshold: "178"
name: ruler_querier_cpu_hpa_default
@@ -2383,14 +2311,6 @@ spec:
max by (pod) (kube_pod_container_status_last_terminated_reason{container="ruler-querier", namespace="default", reason="OOMKilled"})
or vector(0)
)
and
count (
count_over_time(
present_over_time(
container_memory_working_set_bytes{container="ruler-querier",namespace="default"}[1m]
)[15m:1m]
) >= 15
)
serverAddress: http://prometheus.default:9090/prometheus
threshold: "955630223"
name: ruler_querier_memory_hpa_default
@@ -2435,14 +2355,6 @@ spec:
max by (pod) (up{container="ruler-query-frontend",namespace="default"}) > 0
)[15m:]
) * 1000
and
count (
count_over_time(
present_over_time(
container_cpu_usage_seconds_total{container="ruler-query-frontend",namespace="default"}[1m]
)[15m:1m]
) >= 15
)
serverAddress: http://prometheus.default:9090/prometheus
threshold: "1780"
name: ruler_query_frontend_cpu_hpa_default
@@ -2469,14 +2381,6 @@ spec:
max by (pod) (kube_pod_container_status_last_terminated_reason{container="ruler-query-frontend", namespace="default", reason="OOMKilled"})
or vector(0)
)
and
count (
count_over_time(
present_over_time(
container_memory_working_set_bytes{container="ruler-query-frontend",namespace="default"}[1m]
)[15m:1m]
) >= 15
)
serverAddress: http://prometheus.default:9090/prometheus
threshold: "559939584"
name: ruler_query_frontend_memory_hpa_default
96 changes: 0 additions & 96 deletions operations/mimir-tests/test-autoscaling-generated.yaml
@@ -1974,14 +1974,6 @@ spec:
max by (pod) (up{container="alertmanager",namespace="default"}) > 0
)[15m:]
) * 1000
and
count (
count_over_time(
present_over_time(
container_cpu_usage_seconds_total{container="alertmanager",namespace="default"}[1m]
)[15m:1m]
) >= 15
)
serverAddress: http://prometheus.default:9090/prometheus
threshold: "2000"
name: cortex_alertmanager_cpu_hpa_default
@@ -2008,14 +2000,6 @@ spec:
max by (pod) (kube_pod_container_status_last_terminated_reason{container="alertmanager", namespace="default", reason="OOMKilled"})
or vector(0)
)
and
count (
count_over_time(
present_over_time(
container_memory_working_set_bytes{container="alertmanager",namespace="default"}[1m]
)[15m:1m]
) >= 15
)
serverAddress: http://prometheus.default:9090/prometheus
threshold: "10737418240"
name: cortex_alertmanager_memory_hpa_default
@@ -2062,14 +2046,6 @@ spec:
max by (pod) (min_over_time(kube_pod_status_ready{namespace="default",condition="true"}[1m])) > 0
)[15m:]
) * 1000
and
count (
count_over_time(
present_over_time(
container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m]
)[15m:1m]
) >= 15
)
serverAddress: http://prometheus.default:9090/prometheus
threshold: "2000"
name: cortex_distributor_cpu_hpa_default
@@ -2096,14 +2072,6 @@ spec:
max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"})
or vector(0)
)
and
count (
count_over_time(
present_over_time(
container_memory_working_set_bytes{container="distributor",namespace="default"}[1m]
)[15m:1m]
) >= 15
)
serverAddress: http://prometheus.default:9090/prometheus
threshold: "3435973836"
name: cortex_distributor_memory_hpa_default
@@ -2193,14 +2161,6 @@ spec:
max by (pod) (up{container="query-frontend",namespace="default"}) > 0
)[15m:]
) * 1000
and
count (
count_over_time(
present_over_time(
container_cpu_usage_seconds_total{container="query-frontend",namespace="default"}[1m]
)[15m:1m]
) >= 15
)
serverAddress: http://prometheus.default:9090/prometheus
threshold: "1875"
name: query_frontend_cpu_hpa_default
@@ -2227,14 +2187,6 @@ spec:
max by (pod) (kube_pod_container_status_last_terminated_reason{container="query-frontend", namespace="default", reason="OOMKilled"})
or vector(0)
)
and
count (
count_over_time(
present_over_time(
container_memory_working_set_bytes{container="query-frontend",namespace="default"}[1m]
)[15m:1m]
) >= 15
)
serverAddress: http://prometheus.default:9090/prometheus
threshold: "629145600"
name: query_frontend_memory_hpa_default
@@ -2271,14 +2223,6 @@ spec:
max by (pod) (up{container="ruler",namespace="default"}) > 0
)[15m:]
) * 1000
and
count (
count_over_time(
present_over_time(
container_cpu_usage_seconds_total{container="ruler",namespace="default"}[1m]
)[15m:1m]
) >= 15
)
serverAddress: http://prometheus.default:9090/prometheus
threshold: "1000"
name: ruler_cpu_hpa_default
@@ -2305,14 +2249,6 @@ spec:
max by (pod) (kube_pod_container_status_last_terminated_reason{container="ruler", namespace="default", reason="OOMKilled"})
or vector(0)
)
and
count (
count_over_time(
present_over_time(
container_memory_working_set_bytes{container="ruler",namespace="default"}[1m]
)[15m:1m]
) >= 15
)
serverAddress: http://prometheus.default:9090/prometheus
threshold: "6442450944"
name: ruler_memory_hpa_default
@@ -2349,14 +2285,6 @@ spec:
max by (pod) (up{container="ruler-querier",namespace="default"}) > 0
)[15m:]
) * 1000
and
count (
count_over_time(
present_over_time(
container_cpu_usage_seconds_total{container="ruler-querier",namespace="default"}[1m]
)[15m:1m]
) >= 15
)
serverAddress: http://prometheus.default:9090/prometheus
threshold: "200"
name: ruler_querier_cpu_hpa_default
@@ -2383,14 +2311,6 @@ spec:
max by (pod) (kube_pod_container_status_last_terminated_reason{container="ruler-querier", namespace="default", reason="OOMKilled"})
or vector(0)
)
and
count (
count_over_time(
present_over_time(
container_memory_working_set_bytes{container="ruler-querier",namespace="default"}[1m]
)[15m:1m]
) >= 15
)
serverAddress: http://prometheus.default:9090/prometheus
threshold: "1073741824"
name: ruler_querier_memory_hpa_default
@@ -2435,14 +2355,6 @@ spec:
max by (pod) (up{container="ruler-query-frontend",namespace="default"}) > 0
)[15m:]
) * 1000
and
count (
count_over_time(
present_over_time(
container_cpu_usage_seconds_total{container="ruler-query-frontend",namespace="default"}[1m]
)[15m:1m]
) >= 15
)
serverAddress: http://prometheus.default:9090/prometheus
threshold: "2000"
name: ruler_query_frontend_cpu_hpa_default
@@ -2469,14 +2381,6 @@ spec:
max by (pod) (kube_pod_container_status_last_terminated_reason{container="ruler-query-frontend", namespace="default", reason="OOMKilled"})
or vector(0)
)
and
count (
count_over_time(
present_over_time(
container_memory_working_set_bytes{container="ruler-query-frontend",namespace="default"}[1m]
)[15m:1m]
) >= 15
)
serverAddress: http://prometheus.default:9090/prometheus
threshold: "629145600"
name: ruler_query_frontend_memory_hpa_default
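
For orientation, the query, serverAddress, threshold, and name fields visible in the hunks above belong to the prometheus trigger of a KEDA ScaledObject. The following is only a rough sketch of that surrounding structure, not the repository's generated output: the scale target, replica bounds, and the query body are illustrative placeholders, while the serverAddress, threshold, and trigger name are taken from the distributor CPU trigger above.

    apiVersion: keda.sh/v1alpha1
    kind: ScaledObject
    metadata:
      name: distributor              # placeholder name
      namespace: default
    spec:
      scaleTargetRef:
        name: distributor            # deployment whose replicas the resulting HPA manages (placeholder)
      minReplicaCount: 1
      maxReplicaCount: 10
      triggers:
        - type: prometheus
          metricType: AverageValue   # HPA keeps (query result / replicas) close to the threshold
          name: cortex_distributor_cpu_hpa_default
          metadata:
            serverAddress: http://prometheus.default:9090/prometheus
            threshold: "2000"        # millicores for the CPU triggers, bytes for the memory ones
            # Placeholder query only; the generated query is more involved and, after this
            # commit, no longer ends with the present_over_time guard shown in the hunks.
            query: >
              sum(rate(container_cpu_usage_seconds_total{container="distributor",namespace="default"}[5m])) * 1000

KEDA turns each such trigger into an external metric on the HPA it creates for the scale target, which is why a query that returns no data surfaces as a scaling metric error.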
