From 08ee40434d0f6ec8c4762cc8691403a2fc0214af Mon Sep 17 00:00:00 2001 From: moshemorad Date: Thu, 16 Jan 2025 20:44:05 +0200 Subject: [PATCH] Add retries (#394) --- poetry.lock | 19 +++++++++++++++++-- pyproject.toml | 1 + requirements.txt | 1 + .../integrations/prometheus/metrics/base.py | 2 ++ .../prometheus_metrics_service.py | 12 +++++++++--- 5 files changed, 30 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 4ba3defa..84ba74a9 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. [[package]] name = "about-time" @@ -1745,6 +1745,21 @@ files = [ [package.extras] optional = ["SQLAlchemy (>=1.4,<3)", "aiodns (>1.0)", "aiohttp (>=3.7.3,<4)", "boto3 (<=2)", "websocket-client (>=1,<2)", "websockets (>=10,<11)", "websockets (>=9.1,<10)"] +[[package]] +name = "tenacity" +version = "9.0.0" +description = "Retry code until it succeeds" +optional = false +python-versions = ">=3.8" +files = [ + {file = "tenacity-9.0.0-py3-none-any.whl", hash = "sha256:93de0c98785b27fcf659856aa9f54bfbd399e29969b0621bc7f762bd441b4539"}, + {file = "tenacity-9.0.0.tar.gz", hash = "sha256:807f37ca97d62aa361264d497b0e31e92b8027044942bfa756160d908320d73b"}, +] + +[package.extras] +doc = ["reno", "sphinx"] +test = ["pytest", "tornado (>=4.5)", "typeguard"] + [[package]] name = "tomli" version = "2.0.1" @@ -1919,4 +1934,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<=3.12.3" -content-hash = "ad3dbd10365d7b7557e62bc639a1d58545814f1b451633c6f1aa35f4d6451b62" +content-hash = "b9a190e2dd3b9b093aa679c3529b1edd4fb42e70fbb663e0efaded64b4d51a3f" diff --git a/pyproject.toml b/pyproject.toml index 22b852fc..6cff2281 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,7 @@ idna = "3.7" urllib3 = "^1.26.20" setuptools = "^70.0.0" zipp = "^3.19.1" +tenacity = "^9.0.0" diff --git a/requirements.txt b/requirements.txt index 165f962d..246f68ba 100644 --- a/requirements.txt +++ b/requirements.txt @@ -53,3 +53,4 @@ tzlocal==5.2 ; python_version >= "3.9" and python_full_version < "3.13" urllib3==1.26.19 ; python_version >= "3.9" and python_full_version < "3.13" websocket-client==1.7.0 ; python_version >= "3.9" and python_full_version < "3.13" zipp==3.19.2 ; python_version >= "3.9" and python_version < "3.13" +tenacity==9.0.0 ; python_version >= "3.9" and python_version < "3.13" \ No newline at end of file diff --git a/robusta_krr/core/integrations/prometheus/metrics/base.py b/robusta_krr/core/integrations/prometheus/metrics/base.py index 4169b0f0..347e6b93 100644 --- a/robusta_krr/core/integrations/prometheus/metrics/base.py +++ b/robusta_krr/core/integrations/prometheus/metrics/base.py @@ -11,6 +11,7 @@ import numpy as np import pydantic as pd from prometrix import CustomPrometheusConnect +from tenacity import retry, stop_after_attempt, wait_random from robusta_krr.core.abstract.metrics import BaseMetric from robusta_krr.core.abstract.strategies import PodsTimeData @@ -116,6 +117,7 @@ def _step_to_string(self, step: datetime.timedelta) -> str: return f"{int(step.total_seconds()) // (60 * 60 * 24)}d" return f"{int(step.total_seconds()) // 60}m" + @retry(wait=wait_random(min=2, max=10), stop=stop_after_attempt(5)) def _query_prometheus_sync(self, data: PrometheusMetricData) -> list[PrometheusSeries]: if data.type == QueryType.QueryRange: response = self.prometheus.safe_custom_query_range( diff --git a/robusta_krr/core/integrations/prometheus/metrics_service/prometheus_metrics_service.py b/robusta_krr/core/integrations/prometheus/metrics_service/prometheus_metrics_service.py index 71d5ae8f..68f6bb80 100644 --- a/robusta_krr/core/integrations/prometheus/metrics_service/prometheus_metrics_service.py +++ b/robusta_krr/core/integrations/prometheus/metrics_service/prometheus_metrics_service.py @@ -8,6 +8,7 @@ from kubernetes.client import ApiClient from prometheus_api_client import PrometheusApiClientException from prometrix import PrometheusNotFound, get_custom_prometheus_connect +from tenacity import retry, stop_after_attempt, wait_random from robusta_krr.core.abstract.strategies import PodsTimeData from robusta_krr.core.integrations import openshift @@ -114,6 +115,7 @@ def check_connection(self): """ self.prometheus.check_prometheus_connection() + @retry(wait=wait_random(min=2, max=10), stop=stop_after_attempt(5)) async def query(self, query: str) -> dict: loop = asyncio.get_running_loop() return await loop.run_in_executor( @@ -121,6 +123,7 @@ async def query(self, query: str) -> dict: lambda: self.prometheus.safe_custom_query(query=query)["result"], ) + @retry(wait=wait_random(min=2, max=10), stop=stop_after_attempt(5)) async def query_range(self, query: str, start: datetime, end: datetime, step: timedelta) -> dict: loop = asyncio.get_running_loop() return await loop.run_in_executor( @@ -190,9 +193,12 @@ async def gather_data( ResourceHistoryData: The gathered resource history data. """ logger.debug(f"Gathering {LoaderClass.__name__} metric for {object}") - - metric_loader = LoaderClass(self.prometheus, self.name(), self.executor) - data = await metric_loader.load_data(object, period, step) + try: + metric_loader = LoaderClass(self.prometheus, self.name(), self.executor) + data = await metric_loader.load_data(object, period, step) + except Exception: + logger.exception("Failed to gather resource history data for %s", object) + data = {} if len(data) == 0: if "CPU" in LoaderClass.__name__: