Skip to content

Commit

Permalink
fix(infra): modify threshold of alerting rules for test (#57)
Browse files Browse the repository at this point in the history
* refactor(infra): make functions

* fix(infra): modify threshold of alerting rules for test

* fix(infra): resolve typo
  • Loading branch information
ssupecial authored May 30, 2024
1 parent d0e26de commit d0740b4
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 89 deletions.
153 changes: 68 additions & 85 deletions check-instance/check_instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,112 +12,95 @@
# MS Teams Incoming Webhook URL
WEBHOOK_URL = os.environ["WEBHOOK_URL"]

# PromQL expressions used to count running instances, keyed by service name.
# The same keys are used in the persisted instance-count JSON file.
# NOTE(review): `group(...)` without a `by` clause appears to collapse all
# series into a single one, in which case the outer `count by (instance)`
# would always yield 1 -- confirm whether `count(group by (instance) (...))`
# was intended.
QUERIES = {
    "api-client": (
        "count by (instance) "
        '(group(system_cpu_utilization{job="backend-client-metric"}))'
    ),
    "api-admin": (
        "count by (instance) "
        '(group(system_cpu_utilization{job="backend-admin-metric"}))'
    ),
    "iris": (
        "count by (instance) "
        '(group(cpu_usage_percent{job="iris-metric"}))'
    ),
}


def fetch_metrics(query, timeout=10):
    """Run an instant query against the Prometheus HTTP API.

    Args:
        query: PromQL expression sent as the ``query`` parameter of
            ``{PROMETHEUS_URL}/api/v1/query``.
        timeout: Seconds to wait for Prometheus before giving up. Without a
            timeout a stalled connection would block the caller indefinitely.

    Returns:
        The decoded JSON response body, or ``None`` when the request fails,
        the server answers with an HTTP error status, or the body is not
        valid JSON. Failures are printed rather than raised.
    """
    try:
        response = requests.get(
            f"{PROMETHEUS_URL}/api/v1/query",
            params={"query": query},
            timeout=timeout,
        )
        response.raise_for_status()
        return response.json()
    # ValueError covers JSON decode failures on requests versions whose
    # decode error is not a RequestException subclass.
    except (requests.RequestException, ValueError) as e:
        print(f"Error: {e}")
        return None


def get_instance_counts():
    """Collect the current instance count for every entry in ``QUERIES``.

    Each query is executed through ``fetch_metrics``. A service is included in
    the result only when the fetch succeeded and returned at least one series;
    the value of the first returned series is converted to ``int``.

    Returns:
        dict mapping service name (the ``QUERIES`` key) to its integer count.
        Services whose query failed or returned no series are omitted.
    """
    counts = {}
    for name, promql in QUERIES.items():
        response = fetch_metrics(promql)
        if not response:
            # Request failed (fetch_metrics returned None) or body was empty.
            continue
        series = response["data"]["result"]
        if not series:
            # Query succeeded but matched nothing.
            continue
        # An instant-vector sample is [timestamp, "value"]; index 1 is the value.
        counts[name] = int(series[0]["value"][1])
    return counts


def read_instance_file(file_path):
    """Load the persisted instance counts, creating the file if needed.

    Args:
        file_path: Path of the JSON file holding the last known counts.

    Returns:
        The decoded JSON content of ``file_path``. When the file does not
        exist, or exists but does not contain valid JSON (for example after an
        interrupted write), the file is (re)written with a default of one
        instance per service and that default dict is returned.
    """
    try:
        # Open directly instead of checking os.path.isfile first: this avoids
        # the window between the check and the open.
        with open(file_path, "r") as json_file:
            return json.load(json_file)
    except FileNotFoundError:
        pass
    except json.JSONDecodeError as e:
        print(f"{file_path} is not valid JSON ({e}); resetting to defaults.")

    default_data = {"api-client": 1, "api-admin": 1, "iris": 1}
    with open(file_path, "w") as json_file:
        json.dump(default_data, json_file)
    return default_data


def write_instance_file(file_path, data):
    """Persist ``data`` to ``file_path`` as JSON, overwriting any old content.

    Args:
        file_path: Destination path of the JSON file.
        data: JSON-serializable object to store.
    """
    serialized = json.dumps(data)
    with open(file_path, "w") as out:
        out.write(serialized)


def create_alert_message(alerts):
    """Build the human-readable text describing instance-count changes.

    Args:
        alerts: Iterable of sequences ``(name, current_count, previous_count)``.

    Returns:
        One newline-terminated line per alert stating how many instances were
        added or removed and the previous -> current counts. An empty string
        when ``alerts`` is empty.
    """
    lines = []
    for alert in alerts:
        name, current, previous = alert[0], alert[1], alert[2]
        delta = current - previous
        change = "증가하였습니다" if delta > 0 else "감소하였습니다"
        # The counter "개" and a space belong between the number and the verb;
        # without them the text reads e.g. "2증가하였습니다".
        lines.append(
            f"{name} 인스턴스가 {abs(delta)}개 {change}: {previous}개 -> {current}\n"
        )
    return "".join(lines)


def send_alert(message, timeout=10):
    """Post an instance-count change notification to the configured webhook.

    Args:
        message: Newline-separated alert text; newlines are converted to
            ``<br>`` before sending.
        timeout: Seconds to wait for the webhook before giving up, so a
            stalled endpoint cannot block the caller indefinitely.

    Delivery is best-effort: connection errors, timeouts and HTTP error
    responses are printed and not raised.
    """
    payload = {
        "title": "인스턴스 개수 변경 알림",
        "text": message.replace("\n", "<br>"),
    }
    try:
        response = requests.post(WEBHOOK_URL, json=payload, timeout=timeout)
        # Surface 4xx/5xx replies instead of silently treating them as sent.
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to send alert: {e}")


if __name__ == "__main__":
print("Start check instance")

file_path = "instance.json"

if os.path.isfile(file_path):
print(f"{file_path} exists.")
else: # 인스턴스 개수를 기록하는 파일이 없으면 생성 (기본값: 1)
print(f"{file_path} does not exist.")
with open("instance.json", "w") as json_file:
data = {"api-client": 1, "api-admin": 1, "iris": 1}
json.dump(data, json_file)
before_data = read_instance_file(file_path)

while True:
# 데이터 수집
try:
backend_client_metric_query = 'count by (instance) (group(system_cpu_utilization{job="backend-client-metric"}))'
result_api_client = fetch_metrics(backend_client_metric_query)
backend_admin_metric_query = 'count by (instance) (group(system_cpu_utilization{job="backend-admin-metric"}))'
result_api_admin = fetch_metrics(backend_admin_metric_query)
iris_metric_query = (
'count by (instance) (group(cpu_usage_percent{job="iris-metric"}))'
)
result_iris = fetch_metrics(iris_metric_query)

data = {}

data = get_instance_counts()
alerts = []
print("현재 시간: ", datetime.datetime.now())

# Prometheus에서 데이터를 가져온 경우만
if result_api_client["data"]["result"]:
result = result_api_client["data"]["result"][0]["value"][1]
data["api-client"] = result
print("Client 인스턴스 개수: ", (result))

if result_api_admin["data"]["result"]:
result = result_api_admin["data"]["result"][0]["value"][1]
data["api-admin"] = result
print("Admin 인스턴스 개수: ", (result))

if result_iris["data"]["result"]:
result = result_iris["data"]["result"][0]["value"][1]
data["iris"] = result
print("Iris 인스턴스 개수: ", (result))

# 이전 데이터와 비교
with open("instance.json", "r") as json_file:
before_data = json.load(json_file)

# Prometheus에서 데이터를 가져온 경우 & 이전 데이터와 다른 경우
for key, value in data.items():
print(key)
if key == "api-client":
if data["api-client"] != before_data["api-client"]:
print("api-client 인스턴스 개수 변경")
alerts.append(
(
"Client API",
int(data["api-client"]),
int(before_data["api-client"]),
)
)
before_data["api-client"] = data["api-client"]
if key == "api-admin":
if data["api-admin"] != before_data["api-admin"]:
print("api-admin 인스턴스 개수 변경")
alerts.append(
(
"Admin API",
int(data["api-admin"]),
int(before_data["api-admin"]),
)
)
before_data["api-admin"] = data["api-admin"]
if key == "iris":
if data["iris"] != before_data["iris"]:
print("iris 인스턴스 개수 변경")
alerts.append(
("Iris", int(data["iris"]), int(before_data["iris"]))
)
before_data["iris"] = data["iris"]
if value != before_data[key]:
alerts.append((key, value, before_data[key]))
before_data[key] = value

if alerts:
with open("instance.json", "w") as json_file:
json.dump(before_data, json_file) # 변경된 인스턴스 개수 저장

message = ""
for alert in alerts:
if (alert[1] - alert[2]) > 0:
message += f"{alert[0]} 인스턴스가 {alert[1]-alert[2]}개 증가하였습니다: {alert[2]}개 -> {alert[1]}\n"
else:
message += f"{alert[0]} 인스턴스 {alert[2]-alert[1]}개 감소하였습니다: {alert[2]}개 -> {alert[1]}\n"
payload = {
"title": "인스턴스 개수 변경 알림",
"text": message,
}
requests.post(WEBHOOK_URL, json=payload)
write_instance_file(file_path, before_data)
message = create_alert_message(alerts)
send_alert(message)

except Exception as e:
print(f"Error: {e}")
Expand Down
8 changes: 4 additions & 4 deletions config/prometheus/rules/cpu_rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ groups:
- name: cpu_alerts_per_container
rules:
- alert: HighCpuUsageClientAPIWarning
expr: (sum by (instance) (avg_over_time(system_cpu_utilization{job="backend-client-metric", system_cpu_state!="idle"}[5m])) + sum by (instance) (avg_over_time(process_cpu_utilization{job="backend-client-metric"}[5m]))) > 5
expr: (sum by (instance) (avg_over_time(system_cpu_utilization{job="backend-client-metric", system_cpu_state!="idle"}[5m])) + sum by (instance) (avg_over_time(process_cpu_utilization{job="backend-client-metric"}[5m]))) > 0.01
for: 1m
labels:
severity: warning
Expand All @@ -12,7 +12,7 @@ groups:
value: '{{ $value | printf "%.2f" }}'

- alert: HighCpuUsageClientAPICritical
expr: (sum by (instance) (avg_over_time(system_cpu_utilization{job="backend-client-metric", system_cpu_state!="idle"}[5m])) + sum by (instance) (avg_over_time(process_cpu_utilization{job="backend-client-metric"}[5m]))) > 90
expr: (sum by (instance) (avg_over_time(system_cpu_utilization{job="backend-client-metric", system_cpu_state!="idle"}[5m])) + sum by (instance) (avg_over_time(process_cpu_utilization{job="backend-client-metric"}[5m]))) > 0.9
for: 1m
labels:
severity: 'critical'
Expand All @@ -23,7 +23,7 @@ groups:


- alert: HighCpuUsageAdminAPIWarning
expr: (sum by (instance) (avg_over_time(system_cpu_utilization{job="backend-admin-metric", system_cpu_state!="idle"}[5m])) + sum by (instance) (avg_over_time(process_cpu_utilization{job="backend-admin-metric"}[5m]))) > 80
expr: (sum by (instance) (avg_over_time(system_cpu_utilization{job="backend-admin-metric", system_cpu_state!="idle"}[5m])) + sum by (instance) (avg_over_time(process_cpu_utilization{job="backend-admin-metric"}[5m]))) > 0.8
for: 1m
labels:
severity: warning
Expand All @@ -33,7 +33,7 @@ groups:
value: '{{ $value | printf "%.2f" }}'

- alert: HighCpuUsageAdminAPICritical
expr: (sum by (instance) (avg_over_time(system_cpu_utilization{job="backend-admin-metric", system_cpu_state!="idle"}[5m])) + sum by (instance) (avg_over_time(process_cpu_utilization{job="backend-admin-metric"}[5m]))) > 90
expr: (sum by (instance) (avg_over_time(system_cpu_utilization{job="backend-admin-metric", system_cpu_state!="idle"}[5m])) + sum by (instance) (avg_over_time(process_cpu_utilization{job="backend-admin-metric"}[5m]))) > 0.9
for: 1m
labels:
severity: 'critical'
Expand Down

0 comments on commit d0740b4

Please sign in to comment.