[FEAT] Set up the model serving environment, web server, and model performance monitoring (#10)
* Feat : Prometheus & Grafana Dockerfile

- #4

* Feat : Prometheus & Grafana & Alertmanager docker compose yaml

- #4

* Feat : Add alertmanager.yml & prometheus.yml

- #4

* Feat : Add node_exporter.yaml

- #4

---------

Co-authored-by: 김태양_T6044 <[email protected]>
internationalwe and taeyang916 authored Mar 13, 2024
1 parent 4c93d4f commit 3b86564
Showing 12 changed files with 454 additions and 4 deletions.
11 changes: 9 additions & 2 deletions .gitignore
@@ -158,6 +158,13 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

grafana.db
000000*
lock
queries.active
tombstones
prometheus/prometheus-volume/data/*
index
0000*
*.onnx
*.pth
4 changes: 2 additions & 2 deletions .gitmessage.txt
@@ -2,7 +2,7 @@
# Write the subject on the blank line below, in the form <Type> : <Subject>
# Capitalize the first letter of the type / keep the subject within 50 characters / state clearly WHAT changed / no trailing period
# e.g.) Feat : Add login

Feat : alertmanager.yml & prometheus.yml add file
# Do not delete the blank line directly below (it separates the subject from the body)

################
@@ -12,7 +12,7 @@
################
# Write the footer on the line below (e.g., issue numbers related to this commit)
# e.g.) #7
-
- #4
################
# Feat : Add a new feature
# Fix : Fix a bug
172 changes: 172 additions & 0 deletions docker/monitoring/alertmanager/config/alertmanager.yml
@@ -0,0 +1,172 @@
# global: ## set the default values here
#   resolve_timeout: 1m # declare an alert resolved if it has not been updated within this period | default : 5m
#   # optional - email-related defaults
#   # optional - slack-related defaults

# # If templates are used: template location = a /templates directory created alongside alertmanager.yml
# templates:
#   - '/etc/alertmanager/templates/*.tmpl'

# # route defines the routing tree and the nodes of its subtrees.
# # Every alert enters the routing tree at the top-level route and traverses the child nodes;
# # it stops when continue is false, otherwise it keeps matching against the deeper nodes.
# route:
#   group_by: ['emailAlert'] # refers to a label_name under targets -> labels in prometheus.yml
#   group_wait: 10s # wait time before sending a notification, to prevent duplicates when an inhibit condition occurs | default = 30s
#   group_interval: 300s # wait time before sending a notification about new alerts for a group | default = 5m
#   repeat_interval: 1h # wait time before re-sending a notification that is already firing | default = 4h
#   receiver: 'email' # notification target

#   # routes: # routing targets
#   #   - match:
#   #       alertname: 'test-alram' # send a notification when the alertname matches
#   #     receiver: 'slack-channel'

# receivers:
#   - name: 'email'
#     email_configs:
#       - to: '[email protected], [email protected]'
#         from: '[email protected]'
#         smarthost: 'email host'
#         auth_username: 'login username'
#         auth_identity: 'auth identity'
#         auth_password: 'password'
#         send_resolved: true # send a notification when an alert is resolved | default = false
#         # headers:
#         #   subject: "Custom Warning: {{ .CommonLabels.job }} container Down"
#         # html: '{{ template "email_template.html" . }}'
# inhibit_rules:
#   - source_match:
#       severity: 'critical'
#     target_match: # the alerts to be muted
#       severity: 'warning'
#     equal: ['alertname', 'name'] # apply the inhibit rule when these label values match

#   - name: slack-channel
#     slack_configs:
#       - channel: '#monitoring'
#         icon_url: https://avatars3.githubusercontent.com/u/3380462
#         send_resolved: true
#         title: '{{ template "custom_title" . }}'
#         text: '{{ template "custom_slack_message" . }}'
global:
  # The smarthost and SMTP sender used for mail notifications.
  smtp_smarthost: 'localhost:25'
  smtp_from: '[email protected]'
  smtp_auth_username: 'alertmanager'
  smtp_auth_password: 'password'

# The directory from which notification templates are read.
templates:
  - '/etc/alertmanager/template/*.tmpl'

# The root route on which each incoming alert enters.
route:
  # The labels by which incoming alerts are grouped together. For example,
  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
  # be batched into a single group.
  #
  # To aggregate by all possible labels use '...' as the sole label name.
  # This effectively disables aggregation entirely, passing through all
  # alerts as-is. This is unlikely to be what you want, unless you have
  # a very low alert volume or your upstream notification system performs
  # its own grouping. Example: group_by: [...]
  group_by: ['alertname', 'cluster', 'service']

  # When a new group of alerts is created by an incoming alert, wait at
  # least 'group_wait' to send the initial notification.
  # This ensures that multiple alerts for the same group that start
  # firing shortly after one another are batched together on the first
  # notification.
  group_wait: 30s

  # When the first notification was sent, wait 'group_interval' to send a batch
  # of new alerts that started firing for that group.
  group_interval: 5m

  # If an alert has successfully been sent, wait 'repeat_interval' to
  # resend them.
  repeat_interval: 3h

  # A default receiver
  receiver: team-X-mails

  # All the above attributes are inherited by all child routes and can
  # be overwritten on each.

  # The child route trees.
  routes:
    # This route performs a regular expression match on alert labels to
    # catch alerts that are related to a list of services.
    - matchers:
        - service=~"foo1|foo2|baz"
      receiver: team-X-mails
      # The service has a sub-route for critical alerts; any alerts
      # that do not match, i.e. severity != critical, fall back to the
      # parent node and are sent to 'team-X-mails'
      routes:
        - matchers:
            - severity="critical"
          receiver: team-X-pager
    - matchers:
        - service="files"
      receiver: team-Y-mails

      routes:
        - matchers:
            - severity="critical"
          receiver: team-Y-pager

    # This route handles all alerts coming from a database service. If there's
    # no team to handle it, it defaults to the DB team.
    - matchers:
        - service="database"
      receiver: team-DB-pager
      # Also group alerts by affected database.
      group_by: [alertname, cluster, database]
      routes:
        - matchers:
            - owner="team-X"
          receiver: team-X-pager
          continue: true
        - matchers:
            - owner="team-Y"
          receiver: team-Y-pager


# Inhibition rules allow muting a set of alerts given that another alert is
# firing.
# We use this to mute any warning-level notifications if the same alert is
# already critical.
inhibit_rules:
  - source_matchers: [severity="critical"]
    target_matchers: [severity="warning"]
    # Apply inhibition if the alertname is the same.
    # CAUTION:
    #   If all label names listed in `equal` are missing
    #   from both the source and target alerts,
    #   the inhibition rule will apply!
    equal: [alertname, cluster, service]


receivers:
  - name: 'team-X-mails'
    email_configs:
      - to: '[email protected]'

  - name: 'team-X-pager'
    email_configs:
      - to: '[email protected]'
    pagerduty_configs:
      - service_key: <team-X-key>

  - name: 'team-Y-mails'
    email_configs:
      - to: '[email protected]'

  - name: 'team-Y-pager'
    pagerduty_configs:
      - service_key: <team-Y-key>

  - name: 'team-DB-pager'
    pagerduty_configs:
      - service_key: <team-DB-key>
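The commented-out section above sketches a Slack receiver but never activates it. As a rough illustration (the webhook URL and channel below are placeholders, not part of this commit), a Slack receiver could be added under receivers and referenced from a route:

  - name: 'slack-channel'
    slack_configs:
      - channel: '#monitoring' # hypothetical channel
        api_url: 'https://hooks.slack.com/services/T000/B000/XXXX' # hypothetical webhook URL
        send_resolved: true

Routing can then be dry-run against this file with amtool, e.g. amtool config routes test --config.file=alertmanager.yml severity=critical service=database, to confirm which receiver a given label set would reach.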
47 changes: 47 additions & 0 deletions docker/monitoring/docker-compose_monitoring.yaml
@@ -0,0 +1,47 @@
version: '3.7'

services:
  prometheus:
    image: prom/prometheus
    container_name: prometheus
    volumes:
      - ./prometheus/config/:/etc/prometheus/
      - ./prometheus/prometheus-volume:/prometheus
    ports:
      - 9090:9090
    command:
      - "--web.enable-lifecycle"
      - '--config.file=/etc/prometheus/prometheus.yml'
    restart: always
    networks:
      - promnet
    user: root

  grafana:
    image: grafana/grafana
    container_name: grafana
    volumes:
      - ./grafana-volume:/var/lib/grafana
    restart: always
    networks:
      - promnet
    ports:
      - 3300:3000
    user: root

  alertmanager:
    image: prom/alertmanager
    container_name: alertmanager
    user: root
    ports:
      - 9093:9093
    volumes:
      - ./alertmanager/config/:/etc/alertmanager/
    networks:
      - promnet
    restart: always
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'

networks:
  promnet:
    driver: bridge
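Note that prometheus.yml in this commit leaves its alerting section commented out, so alerts evaluated from rule.yml are not yet delivered anywhere. Since Prometheus and Alertmanager share the promnet bridge network here, wiring them together would plausibly look like the following addition to prometheus.yml (a sketch assuming Docker's DNS resolution of the alertmanager service name):

alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']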
12 changes: 12 additions & 0 deletions docker/monitoring/docker-compose_node_exporter.yaml
@@ -0,0 +1,12 @@
version: '3.7'

services:
  node:
    image: prom/node-exporter
    container_name: node-exporter
    ports:
      - 9100:9100
    networks:
      - promnet

networks:
  promnet:
    driver: bridge
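Because this compose file is brought up separately, its promnet is a new bridge network distinct from the monitoring stack's promnet, which is why prometheus.yml scrapes the exporter through the host address 192.168.0.80:9100 rather than by container name. One hedged alternative (assuming the monitoring stack's compose project is named monitoring, so its network is created as monitoring_promnet) is to join that existing network instead of declaring a new one:

networks:
  promnet:
    external: true       # reuse an already-created network
    name: monitoring_promnet # assumed project-prefixed network name

Prometheus could then scrape node-exporter:9100 directly.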
55 changes: 55 additions & 0 deletions docker/monitoring/prometheus/config/prometheus.yml
@@ -0,0 +1,55 @@
# Setting default values - this whole section is optional
global:
  scrape_interval: 15s # change the default scrape interval for targets to 15s / default = 1m
  scrape_timeout: 15s # how long before a scrape request times out / default = 10s
  evaluation_interval: 2m # how frequently rules are evaluated / default = 1m

  # Attach these labels to any time series or alerts when communicating with
  # external systems (federation, remote storage, Alertmanager).
  external_labels:
    monitor: 'codelab-monitor' # label attached by default
  # query_log_file: path_where_logs_are_saved.log # records Prometheus query logs; nothing is recorded if unset

# Load rules and evaluate them periodically according to 'evaluation_interval'.
rule_files:
  - "rule.yml" # located in the same directory as prometheus.yml

# The endpoints to scrape metrics from; here it points at the Prometheus server itself.
# scrape_configs:
#   # Time series scraped by this config get a `job=<job_name>` label.
#   # metrics_path defaults to '/metrics' and scheme defaults to `http`.
#   - job_name: 'monitoring-item' # job_name must be unique across all scrape configs
#     scrape_interval: 10s # optional here, since the default is defined under global
#     scrape_timeout: 10s # optional here, since the default is defined under global
#     metrics_path: '/metrics' # optional - changes the URI Prometheus requests for metrics | default = /metrics
#     honor_labels: false # optional - whether to rewrite labels on conflict (if false, labels stay unchanged) | default = false
#     honor_timestamps: false # optional - if honor_labels is true, metric timestamps are exposed (when true) | default = false
#     scheme: 'http' # optional - scheme used for scrape requests | default = http
#     params: # optional - params sent with the scrape request
#       user-id: ['[email protected]']
#     static_configs:
#       - targets: ['localhost:9100']
scrape_configs:
  - job_name: 'node_exporter'
    metrics_path: /metrics
    static_configs:
      - targets: ['192.168.0.80:9100']
    # other options: authorization settings,
    # service discovery (sd) settings

# Settings for the actual scrape targets
# static_configs:
#   - targets: ['192.168.0.80:3300', 'localhost:9100', 'localhost:80'] # prometheus, node-exporter, cadvisor
#     labels: # optional - labels attached to all scraped metrics
#       service: 'monitor-1'

# relabel_configs - modify labels before the scrape
# metric_relabel_configs - dynamically rewrite the labels of scraped series (drop, replace, labeldrop)


# # Alerting specifies settings related to the Alertmanager.
# alerting:
#   alert_relabel_configs:
#     [ - <relabel_config> ... ]
#   alertmanagers:
#     [ - <alertmanager_config> ... ]
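The relabel notes above can be made concrete. As one hedged example of metric_relabel_configs (the dropped metric family is illustrative, not from this commit), the node_exporter job could discard Go garbage-collector series it does not need before storage:

scrape_configs:
  - job_name: 'node_exporter'
    metrics_path: /metrics
    static_configs:
      - targets: ['192.168.0.80:9100']
    metric_relabel_configs:
      - source_labels: [__name__] # match on the metric name itself
        regex: 'go_gc_.*'         # illustrative pattern
        action: drop              # drop matching series before ingestion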
21 changes: 21 additions & 0 deletions docker/monitoring/prometheus/config/rule.yml
@@ -0,0 +1,21 @@
groups:
  - name: example # must be unique within the file
    rules:

      # Alert for any instance that is unreachable for >5 minutes.
      - alert: InstanceDown
        expr: up == 0
        for: 5m
        labels:
          severity: page
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."

      # Alert for any instance that has a median request latency >1s.
      - alert: APIHighRequestLatency
        expr: api_http_request_latencies_second{quantile="0.5"} > 1
        for: 10m
        annotations:
          summary: "High request latency on {{ $labels.instance }}"
          description: "{{ $labels.instance }} has a median request latency above 1s (current value: {{ $value }}s)"
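Both sample alerts appear to come from the Prometheus documentation; the second references api_http_request_latencies_second, a metric this stack does not export. A rule that would actually fire against the node_exporter target configured earlier might look like this sketch (the metrics are standard node_exporter gauges; the 10% threshold is an assumption):

      - alert: HostOutOfMemory
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < 0.10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Host {{ $labels.instance }} is low on memory"
          description: "{{ $labels.instance }} has less than 10% memory available (current value: {{ $value }})."

Rule files can be validated before a reload with promtool check rules rule.yml.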
(file name truncated in this view; the content below is apparently a Prometheus TSDB block meta.json from the prometheus-volume data directory)
@@ -0,0 +1,30 @@
{
  "ulid": "01HR56HBDPPW8C97WH2DP5EKGF",
  "minTime": 1709539243307,
  "maxTime": 1709553600000,
  "stats": {
    "numSamples": 1410133,
    "numSeries": 1494,
    "numChunks": 11917
  },
  "compaction": {
    "level": 2,
    "sources": [
      "01HR4HZ2TH3SAFBKD7DGA2XY12",
      "01HR4RSWT42NASZE0XSXF5G73E"
    ],
    "parents": [
      {
        "ulid": "01HR4HZ2TH3SAFBKD7DGA2XY12",
        "minTime": 1709539243307,
        "maxTime": 1709546400000
      },
      {
        "ulid": "01HR4RSWT42NASZE0XSXF5G73E",
        "minTime": 1709546401007,
        "maxTime": 1709553600000
      }
    ]
  },
  "version": 1
}