From d0740b42ca097f5b04772776015ad677ad012249 Mon Sep 17 00:00:00 2001 From: Eunsu Kang <56429615+ssupecial@users.noreply.github.com> Date: Thu, 30 May 2024 12:37:13 +0900 Subject: [PATCH] fix(infra): modify threshold of alerting rules for test (#57) * refactor(infra): make functions * fix(infra): modify threshold of alerting rules for test * fix(infra): resolve typo --- check-instance/check_instance.py | 153 ++++++++++++-------------- config/prometheus/rules/cpu_rules.yml | 8 +- 2 files changed, 72 insertions(+), 89 deletions(-) diff --git a/check-instance/check_instance.py b/check-instance/check_instance.py index e2178e9..ea1a99b 100644 --- a/check-instance/check_instance.py +++ b/check-instance/check_instance.py @@ -12,112 +12,95 @@ # MS Teams Incoming Webhook URL WEBHOOK_URL = os.environ["WEBHOOK_URL"] +# Prometheus queries +QUERIES = { + "api-client": 'count by (instance) (group(system_cpu_utilization{job="backend-client-metric"}))', + "api-admin": 'count by (instance) (group(system_cpu_utilization{job="backend-admin-metric"}))', + "iris": 'count by (instance) (group(cpu_usage_percent{job="iris-metric"}))', +} + def fetch_metrics(query): - response = requests.get(f"{PROMETHEUS_URL}/api/v1/query", params={"query": query}) - if response.status_code == 200: + try: + response = requests.get( + f"{PROMETHEUS_URL}/api/v1/query", params={"query": query} + ) + response.raise_for_status() return response.json() - else: + except requests.RequestException as e: + print(f"Error: {e}") return None +def get_instance_counts(): + data = {} + for key, query in QUERIES.items(): + result = fetch_metrics(query) + if result and result["data"]["result"]: + data[key] = int(result["data"]["result"][0]["value"][1]) + return data + + +def read_instance_file(file_path): + if os.path.isfile(file_path): + with open(file_path, "r") as json_file: + return json.load(json_file) + else: + default_data = {"api-client": 1, "api-admin": 1, "iris": 1} + with open(file_path, "w") as json_file: + json.dump(default_data, json_file) + return default_data + + +def write_instance_file(file_path, data): + with open(file_path, "w") as json_file: + json.dump(data, json_file) + + +def create_alert_message(alerts): + message = "" + for alert in alerts: + delta = alert[1] - alert[2] + change = "증가하였습니다" if delta > 0 else "감소하였습니다" + message += f"{alert[0]} 인스턴스가 {abs(delta)}개 {change}: {alert[2]}개 -> {alert[1]}개\n" + return message + + +def send_alert(message): + payload = { + "title": "인스턴스 개수 변경 알림", + "text": message.replace("\n", "
"), + } + try: + requests.post(WEBHOOK_URL, json=payload) + except requests.RequestException as e: + print(f"Failed to send alert: {e}") + + if __name__ == "__main__": print("Start check instance") file_path = "instance.json" - - if os.path.isfile(file_path): - print(f"{file_path} exists.") - else: # 인스턴스 개수를 기록하는 파일이 없으면 생성 (기본값: 1) - print(f"{file_path} does not exist.") - with open("instance.json", "w") as json_file: - data = {"api-client": 1, "api-admin": 1, "iris": 1} - json.dump(data, json_file) + before_data = read_instance_file(file_path) while True: # 데이터 수집 try: - backend_client_metric_query = 'count by (instance) (group(system_cpu_utilization{job="backend-client-metric"}))' - result_api_client = fetch_metrics(backend_client_metric_query) - backend_admin_metric_query = 'count by (instance) (group(system_cpu_utilization{job="backend-admin-metric"}))' - result_api_admin = fetch_metrics(backend_admin_metric_query) - iris_metric_query = ( - 'count by (instance) (group(cpu_usage_percent{job="iris-metric"}))' - ) - result_iris = fetch_metrics(iris_metric_query) - - data = {} + + data = get_instance_counts() alerts = [] print("현재 시간: ", datetime.datetime.now()) - # Prometheus에서 데이터를 가져온 경우만 - if result_api_client["data"]["result"]: - result = result_api_client["data"]["result"][0]["value"][1] - data["api-client"] = result - print("Client 인스턴스 개수: ", (result)) - - if result_api_admin["data"]["result"]: - result = result_api_admin["data"]["result"][0]["value"][1] - data["api-admin"] = result - print("Admin 인스턴스 개수: ", (result)) - - if result_iris["data"]["result"]: - result = result_iris["data"]["result"][0]["value"][1] - data["iris"] = result - print("Iris 인스턴스 개수: ", (result)) - - # 이전 데이터와 비교 - with open("instance.json", "r") as json_file: - before_data = json.load(json_file) - # Prometheus에서 데이터를 가져온 경우 & 이전 데이터와 다른 경우 for key, value in data.items(): - print(key) - if key == "api-client": - if data["api-client"] != before_data["api-client"]: - print("api-client 인스턴스 개수 변경") - alerts.append( - ( - "Client API", - int(data["api-client"]), - int(before_data["api-client"]), - ) - ) - before_data["api-client"] = data["api-client"] - if key == "api-admin": - if data["api-admin"] != before_data["api-admin"]: - print("api-admin 인스턴스 개수 변경") - alerts.append( - ( - "Admin API", - int(data["api-admin"]), - int(before_data["api-admin"]), - ) - ) - before_data["api-admin"] = data["api-admin"] - if key == "iris": - if data["iris"] != before_data["iris"]: - print("iris 인스턴스 개수 변경") - alerts.append( - ("Iris", int(data["iris"]), int(before_data["iris"])) - ) - before_data["iris"] = data["iris"] + if value != before_data[key]: + alerts.append((key, value, before_data[key])) + before_data[key] = value if alerts: - with open("instance.json", "w") as json_file: - json.dump(before_data, json_file) # 변경된 인스턴스 개수 저장 - - message = "" - for alert in alerts: - if (alert[1] - alert[2]) > 0: - message += f"{alert[0]} 인스턴스가 {alert[1]-alert[2]}개 증가하였습니다: {alert[2]}개 -> {alert[1]}개\n" - else: - message += f"{alert[0]} 인스턴스 {alert[2]-alert[1]}개 감소하였습니다: {alert[2]}개 -> {alert[1]}개\n" - payload = { - "title": "인스턴스 개수 변경 알림", - "text": message, - } - requests.post(WEBHOOK_URL, json=payload) + write_instance_file(file_path, before_data) + message = create_alert_message(alerts) + send_alert(message) except Exception as e: print(f"Error: {e}") diff --git a/config/prometheus/rules/cpu_rules.yml b/config/prometheus/rules/cpu_rules.yml index be351ec..f076fd9 100644 --- a/config/prometheus/rules/cpu_rules.yml +++ b/config/prometheus/rules/cpu_rules.yml @@ -2,7 +2,7 @@ groups: - name: cpu_alerts_per_container rules: - alert: HighCpuUsageClientAPIWarning - expr: (sum by (instance) (avg_over_time(system_cpu_utilization{job="backend-client-metric", system_cpu_state!="idle"}[5m])) + sum by (instance) (avg_over_time(process_cpu_utilization{job="backend-client-metric"}[5m]))) > 5 + expr: (sum by (instance) (avg_over_time(system_cpu_utilization{job="backend-client-metric", system_cpu_state!="idle"}[5m])) + sum by (instance) (avg_over_time(process_cpu_utilization{job="backend-client-metric"}[5m]))) > 0.01 for: 1m labels: severity: warning @@ -12,7 +12,7 @@ groups: value: '{{ $value | printf "%.2f" }}' - alert: HighCpuUsageClientAPICritical - expr: (sum by (instance) (avg_over_time(system_cpu_utilization{job="backend-client-metric", system_cpu_state!="idle"}[5m])) + sum by (instance) (avg_over_time(process_cpu_utilization{job="backend-client-metric"}[5m]))) > 90 + expr: (sum by (instance) (avg_over_time(system_cpu_utilization{job="backend-client-metric", system_cpu_state!="idle"}[5m])) + sum by (instance) (avg_over_time(process_cpu_utilization{job="backend-client-metric"}[5m]))) > 0.9 for: 1m labels: severity: 'critical' @@ -23,7 +23,7 @@ groups: - alert: HighCpuUsageAdminAPIWarning - expr: (sum by (instance) (avg_over_time(system_cpu_utilization{job="backend-admin-metric", system_cpu_state!="idle"}[5m])) + sum by (instance) (avg_over_time(process_cpu_utilization{job="backend-admin-metric"}[5m]))) > 80 + expr: (sum by (instance) (avg_over_time(system_cpu_utilization{job="backend-admin-metric", system_cpu_state!="idle"}[5m])) + sum by (instance) (avg_over_time(process_cpu_utilization{job="backend-admin-metric"}[5m]))) > 0.8 for: 1m labels: severity: warning @@ -33,7 +33,7 @@ groups: value: '{{ $value | printf "%.2f" }}' - alert: HighCpuUsageAdminAPICritical - expr: (sum by (instance) (avg_over_time(system_cpu_utilization{job="backend-admin-metric", system_cpu_state!="idle"}[5m])) + sum by (instance) (avg_over_time(process_cpu_utilization{job="backend-admin-metric"}[5m]))) > 90 + expr: (sum by (instance) (avg_over_time(system_cpu_utilization{job="backend-admin-metric", system_cpu_state!="idle"}[5m])) + sum by (instance) (avg_over_time(process_cpu_utilization{job="backend-admin-metric"}[5m]))) > 0.9 for: 1m labels: severity: 'critical'