[FEAT] Set up the model serving environment, web server, and model performance monitoring (#10)
* Feat : Prometheus & Grafana Dockerfile

- #4

* Feat : Prometheus & Grafana & Alertmanager docker compose yaml

- #4

* Feat : Add alertmanager.yml & prometheus.yml

- #4

* Feat : Add node_exporter.yaml

- #4

---------

Co-authored-by: 김태양_T6044 <[email protected]>
internationalwe and taeyang916 authored Mar 13, 2024
1 parent 4c93d4f commit 3b86564
Showing 12 changed files with 454 additions and 4 deletions.
11 changes: 9 additions & 2 deletions .gitignore
@@ -158,6 +158,13 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

grafana.db
000000*
lock
queries.active
tombstones
prometheus/prometheus-volume/data/*
index
0000*
*.onnx
*.pth
4 changes: 2 additions & 2 deletions .gitmessage.txt
@@ -2,7 +2,7 @@
# Write the subject on the blank line below, in the form <Type> : <Subject>
# Capitalize the first letter of the type / keep the subject within 50 characters / state clearly WHAT changed / no trailing period
# e.g.) Feat : Add login

Feat : alertmanager.yml & prometheus.yml add file
# Do not delete the blank line directly below (it separates the subject from the body)

################
@@ -12,7 +12,7 @@
################
# Write the footer on the line below (e.g., issue numbers related to this commit)
# e.g.) #7
-
- #4
################
# Feat : Add a new feature
# Fix : Fix a bug
172 changes: 172 additions & 0 deletions docker/monitoring/alertmanager/config/alertmanager.yml
@@ -0,0 +1,172 @@
# global: ## set the default values here
#   resolve_timeout: 1m # declare an alert resolved if it has not been updated within this period | default : 5m
#   # optional - email-related defaults
#   # optional - slack-related defaults

# # If templates are used: template location = a /templates directory created alongside alertmanager.yml
# templates:
#   - '/etc/alertmanager/templates/*.tmpl'

# # route defines the routing tree and the nodes of its subtrees.
# # Every alert enters the routing tree at the top-level route and traverses the child nodes;
# # it stops when continue is false, otherwise it keeps matching against the deeper nodes.
# route:
#   group_by: ['emailAlert'] # refers to a label_name under targets -> labels in prometheus.yml
#   group_wait: 10s # wait time before sending a notification, to prevent duplicates when an inhibit condition occurs | default = 30s
#   group_interval: 300s # wait time before sending a notification about new alerts for a group | default = 5m
#   repeat_interval: 1h # wait time before re-sending a notification that is already firing | default = 4h
#   receiver: 'email' # notification target

#   # routes: # routing targets
#   #   - match:
#   #       alertname: 'test-alram' # send a notification when the alertname matches
#   #     receiver: 'slack-channel'

# receivers:
#   - name: 'email'
#     email_configs:
#       - to: '[email protected], [email protected]'
#         from: '[email protected]'
#         smarthost: 'email host'
#         auth_username: 'login username'
#         auth_identity: 'auth identity'
#         auth_password: 'password'
#         send_resolved: true # send a notification when an alert is resolved | default = false
#         # headers:
#         #   subject: "Custom Warning: {{ .CommonLabels.job }} container Down"
#         # html: '{{ template "email_template.html" . }}'
# inhibit_rules:
#   - source_match:
#       severity: 'critical'
#     target_match: # the alerts to be muted
#       severity: 'warning'
#     equal: ['alertname', 'name'] # apply the inhibit rule when these label values match

#   - name: slack-channel
#     slack_configs:
#       - channel: '#monitoring'
#         icon_url: https://avatars3.githubusercontent.com/u/3380462
#         send_resolved: true
#         title: '{{ template "custom_title" . }}'
#         text: '{{ template "custom_slack_message" . }}'
global:
  # The smarthost and SMTP sender used for mail notifications.
  smtp_smarthost: 'localhost:25'
  smtp_from: '[email protected]'
  smtp_auth_username: 'alertmanager'
  smtp_auth_password: 'password'

# The directory from which notification templates are read.
templates:
  - '/etc/alertmanager/template/*.tmpl'

# The root route on which each incoming alert enters.
route:
  # The labels by which incoming alerts are grouped together. For example,
  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
  # be batched into a single group.
  #
  # To aggregate by all possible labels use '...' as the sole label name.
  # This effectively disables aggregation entirely, passing through all
  # alerts as-is. This is unlikely to be what you want, unless you have
  # a very low alert volume or your upstream notification system performs
  # its own grouping. Example: group_by: [...]
  group_by: ['alertname', 'cluster', 'service']

  # When a new group of alerts is created by an incoming alert, wait at
  # least 'group_wait' to send the initial notification.
  # This ensures that multiple alerts for the same group that start
  # firing shortly after one another are batched together on the first
  # notification.
  group_wait: 30s

  # When the first notification was sent, wait 'group_interval' to send a batch
  # of new alerts that started firing for that group.
  group_interval: 5m

  # If an alert has successfully been sent, wait 'repeat_interval' to
  # resend them.
  repeat_interval: 3h

  # A default receiver
  receiver: team-X-mails

  # All the above attributes are inherited by all child routes and can
  # be overwritten on each.

  # The child route trees.
  routes:
    # This route performs a regular expression match on alert labels to
    # catch alerts that are related to a list of services.
    - matchers:
        - service=~"foo1|foo2|baz"
      receiver: team-X-mails
      # The service has a sub-route for critical alerts; any alerts
      # that do not match, i.e. severity != critical, fall back to the
      # parent node and are sent to 'team-X-mails'
      routes:
        - matchers:
            - severity="critical"
          receiver: team-X-pager
    - matchers:
        - service="files"
      receiver: team-Y-mails

      routes:
        - matchers:
            - severity="critical"
          receiver: team-Y-pager

    # This route handles all alerts coming from a database service. If there's
    # no team to handle it, it defaults to the DB team.
    - matchers:
        - service="database"
      receiver: team-DB-pager
      # Also group alerts by affected database.
      group_by: [alertname, cluster, database]
      routes:
        - matchers:
            - owner="team-X"
          receiver: team-X-pager
          continue: true
        - matchers:
            - owner="team-Y"
          receiver: team-Y-pager


# Inhibition rules allow muting a set of alerts given that another alert is
# firing.
# We use this to mute any warning-level notifications if the same alert is
# already critical.
inhibit_rules:
  - source_matchers: [severity="critical"]
    target_matchers: [severity="warning"]
    # Apply inhibition if the alertname is the same.
    # CAUTION:
    #   If all label names listed in `equal` are missing
    #   from both the source and target alerts,
    #   the inhibition rule will apply!
    equal: [alertname, cluster, service]


receivers:
  - name: 'team-X-mails'
    email_configs:
      - to: '[email protected]'

  - name: 'team-X-pager'
    email_configs:
      - to: '[email protected]'
    pagerduty_configs:
      - service_key: <team-X-key>

  - name: 'team-Y-mails'
    email_configs:
      - to: '[email protected]'

  - name: 'team-Y-pager'
    pagerduty_configs:
      - service_key: <team-Y-key>

  - name: 'team-DB-pager'
    pagerduty_configs:
      - service_key: <team-DB-key>
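The commented-out section above sketches a Slack receiver but never activates it. As a rough illustration (the webhook URL and channel below are placeholders, not part of this commit), a Slack receiver could be added under receivers and referenced from a route:

  - name: 'slack-channel'
    slack_configs:
      - channel: '#monitoring' # hypothetical channel
        api_url: 'https://hooks.slack.com/services/T000/B000/XXXX' # hypothetical webhook URL
        send_resolved: true

Routing can then be dry-run against this file with amtool, e.g. amtool config routes test --config.file=alertmanager.yml severity=critical service=database, to confirm which receiver a given label set would reach.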
47 changes: 47 additions & 0 deletions docker/monitoring/docker-compose_monitoring.yaml
@@ -0,0 +1,47 @@
version: '3.7'

services:
  prometheus:
    image: prom/prometheus
    container_name: prometheus
    volumes:
      - ./prometheus/config/:/etc/prometheus/
      - ./prometheus/prometheus-volume:/prometheus
    ports:
      - 9090:9090
    command:
      - "--web.enable-lifecycle"
      - '--config.file=/etc/prometheus/prometheus.yml'
    restart: always
    networks:
      - promnet
    user: root

  grafana:
    image: grafana/grafana
    container_name: grafana
    volumes:
      - ./grafana-volume:/var/lib/grafana
    restart: always
    networks:
      - promnet
    ports:
      - 3300:3000
    user: root

  alertmanager:
    image: prom/alertmanager
    container_name: alertmanager
    user: root
    ports:
      - 9093:9093
    volumes:
      - ./alertmanager/config/:/etc/alertmanager/
    networks:
      - promnet
    restart: always
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'

networks:
  promnet:
    driver: bridge
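Note that prometheus.yml in this commit leaves its alerting section commented out, so alerts evaluated from rule.yml are not yet delivered anywhere. Since Prometheus and Alertmanager share the promnet bridge network here, wiring them together would plausibly look like the following addition to prometheus.yml (a sketch assuming Docker's DNS resolution of the alertmanager service name):

alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']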
12 changes: 12 additions & 0 deletions docker/monitoring/docker-compose_node_exporter.yaml
@@ -0,0 +1,12 @@
version: '3.7'

services:
  node:
    image: prom/node-exporter
    container_name: node-exporter
    ports:
      - 9100:9100
    networks:
      - promnet

networks:
  promnet:
    driver: bridge
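Because this compose file is brought up separately, its promnet is a new bridge network distinct from the monitoring stack's promnet, which is why prometheus.yml scrapes the exporter through the host address 192.168.0.80:9100 rather than by container name. One hedged alternative (assuming the monitoring stack's compose project is named monitoring, so its network is created as monitoring_promnet) is to join that existing network instead of declaring a new one:

networks:
  promnet:
    external: true       # reuse an already-created network
    name: monitoring_promnet # assumed project-prefixed network name

Prometheus could then scrape node-exporter:9100 directly.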
55 changes: 55 additions & 0 deletions docker/monitoring/prometheus/config/prometheus.yml
@@ -0,0 +1,55 @@
# Setting default values - this whole section is optional
global:
  scrape_interval: 15s # change the default scrape interval for targets to 15s / default = 1m
  scrape_timeout: 15s # how long before a scrape request times out / default = 10s
  evaluation_interval: 2m # how frequently rules are evaluated / default = 1m

  # Attach these labels to any time series or alerts when communicating with
  # external systems (federation, remote storage, Alertmanager).
  external_labels:
    monitor: 'codelab-monitor' # label attached by default
  # query_log_file: path_where_logs_are_saved.log # records Prometheus query logs; nothing is recorded if unset

# Load rules and evaluate them periodically according to 'evaluation_interval'.
rule_files:
  - "rule.yml" # located in the same directory as prometheus.yml

# The endpoints to scrape metrics from; here it points at the Prometheus server itself.
# scrape_configs:
#   # Time series scraped by this config get a `job=<job_name>` label.
#   # metrics_path defaults to '/metrics' and scheme defaults to `http`.
#   - job_name: 'monitoring-item' # job_name must be unique across all scrape configs
#     scrape_interval: 10s # optional here, since the default is defined under global
#     scrape_timeout: 10s # optional here, since the default is defined under global
#     metrics_path: '/metrics' # optional - changes the URI Prometheus requests for metrics | default = /metrics
#     honor_labels: false # optional - whether to rewrite labels on conflict (if false, labels stay unchanged) | default = false
#     honor_timestamps: false # optional - if honor_labels is true, metric timestamps are exposed (when true) | default = false
#     scheme: 'http' # optional - scheme used for scrape requests | default = http
#     params: # optional - params sent with the scrape request
#       user-id: ['[email protected]']
#     static_configs:
#       - targets: ['localhost:9100']
scrape_configs:
  - job_name: 'node_exporter'
    metrics_path: /metrics
    static_configs:
      - targets: ['192.168.0.80:9100']
    # other options: authorization settings,
    # service discovery (sd) settings

# Settings for the actual scrape targets
# static_configs:
#   - targets: ['192.168.0.80:3300', 'localhost:9100', 'localhost:80'] # prometheus, node-exporter, cadvisor
#     labels: # optional - labels attached to all scraped metrics
#       service: 'monitor-1'

# relabel_configs - modify labels before the scrape
# metric_relabel_configs - dynamically rewrite the labels of scraped series (drop, replace, labeldrop)


# # Alerting specifies settings related to the Alertmanager.
# alerting:
#   alert_relabel_configs:
#     [ - <relabel_config> ... ]
#   alertmanagers:
#     [ - <alertmanager_config> ... ]
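The relabel notes above can be made concrete. As one hedged example of metric_relabel_configs (the dropped metric family is illustrative, not from this commit), the node_exporter job could discard Go garbage-collector series it does not need before storage:

scrape_configs:
  - job_name: 'node_exporter'
    metrics_path: /metrics
    static_configs:
      - targets: ['192.168.0.80:9100']
    metric_relabel_configs:
      - source_labels: [__name__] # match on the metric name itself
        regex: 'go_gc_.*'         # illustrative pattern
        action: drop              # drop matching series before ingestion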
21 changes: 21 additions & 0 deletions docker/monitoring/prometheus/config/rule.yml
@@ -0,0 +1,21 @@
groups:
  - name: example # must be unique within the file
    rules:

      # Alert for any instance that is unreachable for >5 minutes.
      - alert: InstanceDown
        expr: up == 0
        for: 5m
        labels:
          severity: page
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."

      # Alert for any instance that has a median request latency >1s.
      - alert: APIHighRequestLatency
        expr: api_http_request_latencies_second{quantile="0.5"} > 1
        for: 10m
        annotations:
          summary: "High request latency on {{ $labels.instance }}"
          description: "{{ $labels.instance }} has a median request latency above 1s (current value: {{ $value }}s)"
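Both sample alerts appear to come from the Prometheus documentation; the second references api_http_request_latencies_second, a metric this stack does not export. A rule that would actually fire against the node_exporter target configured earlier might look like this sketch (the metrics are standard node_exporter gauges; the 10% threshold is an assumption):

      - alert: HostOutOfMemory
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < 0.10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Host {{ $labels.instance }} is low on memory"
          description: "{{ $labels.instance }} has less than 10% memory available (current value: {{ $value }})."

Rule files can be validated before a reload with promtool check rules rule.yml.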
(file name truncated in this view; the content below is apparently a Prometheus TSDB block meta.json from the prometheus-volume data directory)
@@ -0,0 +1,30 @@
{
  "ulid": "01HR56HBDPPW8C97WH2DP5EKGF",
  "minTime": 1709539243307,
  "maxTime": 1709553600000,
  "stats": {
    "numSamples": 1410133,
    "numSeries": 1494,
    "numChunks": 11917
  },
  "compaction": {
    "level": 2,
    "sources": [
      "01HR4HZ2TH3SAFBKD7DGA2XY12",
      "01HR4RSWT42NASZE0XSXF5G73E"
    ],
    "parents": [
      {
        "ulid": "01HR4HZ2TH3SAFBKD7DGA2XY12",
        "minTime": 1709539243307,
        "maxTime": 1709546400000
      },
      {
        "ulid": "01HR4RSWT42NASZE0XSXF5G73E",
        "minTime": 1709546401007,
        "maxTime": 1709553600000
      }
    ]
  },
  "version": 1
}