From 3b865641dde3315eae47824f1d1f2ba9bc70facf Mon Sep 17 00:00:00 2001 From: internationalwe <46400961+internationalwe@users.noreply.github.com> Date: Wed, 13 Mar 2024 18:33:26 +0900 Subject: [PATCH] =?UTF-8?q?[FEAT]=20=EB=AA=A8=EB=8D=B8=20=EC=84=9C?= =?UTF-8?q?=EB=B9=99=20=ED=99=98=EA=B2=BD,=20=EC=9B=B9=20=EC=84=9C?= =?UTF-8?q?=EB=B2=84=20=EB=B0=8F=20=EB=AA=A8=EB=8D=B8=20=ED=8D=BC=ED=8F=AC?= =?UTF-8?q?=EB=A8=BC=EC=8A=A4=20=EB=AA=A8=EB=8B=88=ED=84=B0=EB=A7=81=20?= =?UTF-8?q?=EA=B5=AC=EC=B6=95=20(#10)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Feat Prometeous, Grafana DockerFile - - #4 * Feat : Proemetous & Grafana & alertmanager docker compose yaml - - #4 * Feat : alertmanger.yml & prometheus.yml add file - - #4 * Feat : node_exporter.yaml add - - #4 --------- Co-authored-by: 김태양_T6044 <52816769+taeyang916@users.noreply.github.com> --- .gitignore | 11 +- .gitmessage.txt | 4 +- .../alertmanager/config/alertmanager.yml | 172 ++++++++++++++++++ .../monitoring/docker-compose_monitoring.yaml | 47 +++++ .../docker-compose_node_exporter.yaml | 12 ++ .../prometheus/config/prometheus.yml | 55 ++++++ docker/monitoring/prometheus/config/rule.yml | 21 +++ .../data/01HR56HBDPPW8C97WH2DP5EKGF/meta.json | 30 +++ .../data/01HR5V4H98RJTZ8Y5Q9CE2NCWG/meta.json | 36 ++++ .../data/01HR68VZHXPFGDPX8TY4P3334K/meta.json | 17 ++ .../data/01HR6FQPT632CR8H0HD8VWM7S3/meta.json | 17 ++ .../data/01HR6FQPYRNSZVAAWP760QNV5J/meta.json | 36 ++++ 12 files changed, 454 insertions(+), 4 deletions(-) create mode 100644 docker/monitoring/alertmanager/config/alertmanager.yml create mode 100644 docker/monitoring/docker-compose_monitoring.yaml create mode 100644 docker/monitoring/docker-compose_node_exporter.yaml create mode 100644 docker/monitoring/prometheus/config/prometheus.yml create mode 100644 docker/monitoring/prometheus/config/rule.yml create mode 100644 docker/monitoring/prometheus/prometheus-volume/data/01HR56HBDPPW8C97WH2DP5EKGF/meta.json create mode 100644 docker/monitoring/prometheus/prometheus-volume/data/01HR5V4H98RJTZ8Y5Q9CE2NCWG/meta.json create mode 100644 docker/monitoring/prometheus/prometheus-volume/data/01HR68VZHXPFGDPX8TY4P3334K/meta.json create mode 100644 docker/monitoring/prometheus/prometheus-volume/data/01HR6FQPT632CR8H0HD8VWM7S3/meta.json create mode 100644 docker/monitoring/prometheus/prometheus-volume/data/01HR6FQPYRNSZVAAWP760QNV5J/meta.json diff --git a/.gitignore b/.gitignore index 2ea3439..ec48707 100644 --- a/.gitignore +++ b/.gitignore @@ -158,6 +158,13 @@ cython_debug/ # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ - +grafana.db +000000* +lock +queries.active +tombstones +prometheus/prometheus-volume/data/* +index +0000* *.onnx -*.pth \ No newline at end of file +*.pth diff --git a/.gitmessage.txt b/.gitmessage.txt index 4c4db38..8c37030 100644 --- a/.gitmessage.txt +++ b/.gitmessage.txt @@ -2,7 +2,7 @@ # <타입> : <제목> 의 형식으로 제목을 아래 공백줄에 작성 # 타입의 첫 문자는 대문자 / 제목은 50자 이내 / 변경사항이 "무엇"인지 명확히 작성 / 끝에 마침표 금지 # 예) Feat : Add login - +Feat : alertmanger.yml & prometheus.yml add file # 바로 아래 공백은 지우지 마세요 (제목과 본문의 분리를 위함) ################ @@ -12,7 +12,7 @@ ################ # 꼬릿말(footer)을 아랫줄에 작성 (현재 커밋과 관련된 이슈 번호 추가 등) # 예) #7 -- +- #4 ################ # Feat : 새로운 기능 추가 # Fix : 버그 수정 diff --git a/docker/monitoring/alertmanager/config/alertmanager.yml b/docker/monitoring/alertmanager/config/alertmanager.yml new file mode 100644 index 0000000..8fcfb09 --- /dev/null +++ b/docker/monitoring/alertmanager/config/alertmanager.yml @@ -0,0 +1,172 @@ +# global: ## default 값 지정해주기 +# resolve_timeout: 1m # 이 시간이 지나도록 경고가 업데이트되지 않은 경우 해결되었다고 선언 | default : 5m +# # 옵션 - email 관련 기본값설정 +# # 옵션 - slack 관련 기본값 + +# # 템플릿이 있을 경우 템플릿 위치 = alertmanager.yml 위치에서 /templates 안에 만듬 +# templates: +# - '/etc/alertmanager/templates/*.tmpl' + +# # route는 루우팅 트리와 그 하위 트리의 노드를 정의한다. +# # 모든 alert는 최상위 경로에서 라우팅 트리를 들어가며, 하위 노드를 가로지르게 되고, conitnue 가 거짓일 경우 멈추고, 아닌 경우 계속 하위트리의 노드에 일치되는지 검색하게 된다. +# route: +# group_by: ['emailAlert'] # prometheus.yml의 targets -labels의 label_name을 의미합니다. +# group_wait: 10s # # inhibit 조건 발생에 대한 중복을 방지하기 위한 알림 발생 전 대기시간 | default = 30s +# group_interval: 300s # 새 알림에 대한 통지를 보내기 전 대기할 시간 | default = 5m +# repeat_interval: 1h # 이미 있는 경우 알림을 다시 보내기전에 대기할 시간 | default = 4h +# receiver: 'email' # 알림 설정 대상 + +# # routes: # 라우트 대상 설정 +# # - match: +# # alertname: 'test-alram' # alertname이 일치하는 경우 알림 발송 +# # receiver: 'slack-channel' + +# receivers: +# - name: 'email' +# email_configs: +# - to: 'a@a.com, b@b.com' +# from: 'sender@c.com' +# smarthost: 'email host' +# auth_username: 'login username' +# auth_identity: 'auth idenetity' +# auth_password: 'password' +# send_resolved: true # alert 가 해결됬을 시 알림이 보내지는 설정 | default = false +# # headers: +# # subject: "Custom Warning: {{ .CommonLabels.job }} container Down" +# # html: '{{ template "email_template.html" . }}' +# inhibit_rules: +# - source_match: +# severity: 'critical' +# target_match: # 음소거가 되어야 할 대상 +# severity: 'warning' +# equal: ['alertname', 'name'] # value값과 동일한 값을 갖는 경우 inhibit rule 적용 + +# - name: slack-channel +# slack_configs: +# - channel: #monitoring +# icon_url: https://avatars3.githubusercontent.com/u/3380462 +# send_resolved: true +# title: '{{ template "custom_title" . }}' +# text: '{{ template "custom_slack_message" . }}' +global: + # The smarthost and SMTP sender used for mail notifications. + smtp_smarthost: 'localhost:25' + smtp_from: 'alertmanager@example.org' + smtp_auth_username: 'alertmanager' + smtp_auth_password: 'password' + +# The directory from which notification templates are read. +templates: + - '/etc/alertmanager/template/*.tmpl' + +# The root route on which each incoming alert enters. +route: + # The labels by which incoming alerts are grouped together. For example, + # multiple alerts coming in for cluster=A and alertname=LatencyHigh would + # be batched into a single group. + # + # To aggregate by all possible labels use '...' as the sole label name. + # This effectively disables aggregation entirely, passing through all + # alerts as-is. This is unlikely to be what you want, unless you have + # a very low alert volume or your upstream notification system performs + # its own grouping. Example: group_by: [...] + group_by: ['alertname', 'cluster', 'service'] + + # When a new group of alerts is created by an incoming alert, wait at + # least 'group_wait' to send the initial notification. + # This way ensures that you get multiple alerts for the same group that start + # firing shortly after another are batched together on the first + # notification. + group_wait: 30s + + # When the first notification was sent, wait 'group_interval' to send a batch + # of new alerts that started firing for that group. + group_interval: 5m + + # If an alert has successfully been sent, wait 'repeat_interval' to + # resend them. + repeat_interval: 3h + + # A default receiver + receiver: team-X-mails + + # All the above attributes are inherited by all child routes and can + # overwritten on each. + + # The child route trees. + routes: + # This routes performs a regular expression match on alert labels to + # catch alerts that are related to a list of services. + - matchers: + - service=~"foo1|foo2|baz" + receiver: team-X-mails + # The service has a sub-route for critical alerts, any alerts + # that do not match, i.e. severity != critical, fall-back to the + # parent node and are sent to 'team-X-mails' + routes: + - matchers: + - severity="critical" + receiver: team-X-pager + - matchers: + - service="files" + receiver: team-Y-mails + + routes: + - matchers: + - severity="critical" + receiver: team-Y-pager + + # This route handles all alerts coming from a database service. If there's + # no team to handle it, it defaults to the DB team. + - matchers: + - service="database" + receiver: team-DB-pager + # Also group alerts by affected database. + group_by: [alertname, cluster, database] + routes: + - matchers: + - owner="team-X" + receiver: team-X-pager + continue: true + - matchers: + - owner="team-Y" + receiver: team-Y-pager + + +# Inhibition rules allow to mute a set of alerts given that another alert is +# firing. +# We use this to mute any warning-level notifications if the same alert is +# already critical. +inhibit_rules: + - source_matchers: [severity="critical"] + target_matchers: [severity="warning"] + # Apply inhibition if the alertname is the same. + # CAUTION: + # If all label names listed in `equal` are missing + # from both the source and target alerts, + # the inhibition rule will apply! + equal: [alertname, cluster, service] + + +receivers: + - name: 'team-X-mails' + email_configs: + - to: 'team-X+alerts@example.org' + + - name: 'team-X-pager' + email_configs: + - to: 'team-X+alerts-critical@example.org' + pagerduty_configs: + - service_key: + + - name: 'team-Y-mails' + email_configs: + - to: 'team-Y+alerts@example.org' + + - name: 'team-Y-pager' + pagerduty_configs: + - service_key: + + - name: 'team-DB-pager' + pagerduty_configs: + - service_key: \ No newline at end of file diff --git a/docker/monitoring/docker-compose_monitoring.yaml b/docker/monitoring/docker-compose_monitoring.yaml new file mode 100644 index 0000000..48c5256 --- /dev/null +++ b/docker/monitoring/docker-compose_monitoring.yaml @@ -0,0 +1,47 @@ +version: '3.7' + +services: + prometheus: + image: prom/prometheus + container_name: prometheus + volumes: + - ./prometheus/config/:/etc/prometheus/ + - ./prometheus/prometheus-volume:/prometheus + ports: + - 9090:9090 + command: + - "--web.enable-lifecycle" + - '--config.file=/etc/prometheus/prometheus.yml' + restart: always + networks: + - promnet + user: root + + grafana: + image: grafana/grafana + container_name: grafana + volumes: + - ./grafana-volume:/var/lib/grafana + restart: always + networks: + - promnet + ports: + - 3300:3000 + user: root + alertmanager: + image: prom/alertmanager + container_name: alertmanager + user: root + ports: + - 9093:9093 + volumes: + - ./alertmanager/config/:/etc/alertmanager/ + networks: + - promnet + restart: always + command: + - '--config.file=/etc/alertmanager/alertmanager.yml' + +networks: + promnet: + driver: bridge \ No newline at end of file diff --git a/docker/monitoring/docker-compose_node_exporter.yaml b/docker/monitoring/docker-compose_node_exporter.yaml new file mode 100644 index 0000000..b757dde --- /dev/null +++ b/docker/monitoring/docker-compose_node_exporter.yaml @@ -0,0 +1,12 @@ +version: '3.7' +services: + node: + image: prom/node-exporter + container_name: node-exporter + ports: + - 9100:9100 + networks: + - promnet +networks: + promnet: + driver: bridge \ No newline at end of file diff --git a/docker/monitoring/prometheus/config/prometheus.yml b/docker/monitoring/prometheus/config/prometheus.yml new file mode 100644 index 0000000..b65d03f --- /dev/null +++ b/docker/monitoring/prometheus/config/prometheus.yml @@ -0,0 +1,55 @@ +# default 값 설정하기 - 여기 부분은 전부 설정 안해줘도 상관없음 +global: + scrape_interval: 15s # scrap target의 기본 interval을 15초로 변경 / default = 1m + scrape_timeout: 15s # scrap request 가 timeout 나는 길이 / default = 10s + evaluation_interval: 2m # rule 을 얼마나 빈번하게 검증하는지 / default = 1m + + # Attach these labels to any time series or alerts when communicating with + # external systems (federation, remote storage, Alertmanager). + external_labels: + monitor: 'codelab-monitor' # 기본적으로 붙여줄 라벨 + # query_log_file: 로그가저장될파일주소.log # prometheus의 쿼리 로그들을 기록, 없으면 기록안함 + +# 규칙을 로딩하고 'evaluation_interval' 설정에 따라 정기적으로 평가한다. +rule_files: + - "rule.yml" # 파일위치는 prometheus.yml 이 있는 곳과 동일 위치 + +# 매트릭을 수집할 엔드포인드로 여기선 Prometheus 서버 자신을 가리킨다. +# scrape_configs: +# # 이 설정에서 수집한 타임시리즈에 `job=`으로 잡의 이름을 설정한다. +# # metrics_path의 기본 경로는 '/metrics'이고 scheme의 기본값은 `http`다 +# - job_name: 'monitoring-item' # job_name 은 모든 scrap 내에서 고유해야함 +# scrape_interval: 10s # global에서 default 값을 정의해주었기 떄문에 안써도됨 +# scrape_timeout: 10s # global에서 default 값을 정의해주었기 떄문에 안써도됨 +# metrics_path: '/metrics' # 옵션 - prometheus 가 metrics를 얻기위해 참조하는 uri 를 변경할 수 있음 | default = /metrics +# honor_labels: false # 옵션 - 라벨 충동이 있을경우 라벨을 변경할지설정(false일 경우 라벨 안바뀜) | default = false +# honor_timestamps: false # 옵션 - honor_labels 이 참일 경우, metrics timestamp가 노출됨(true 일경우) | default = false +# scheme: 'http' # 옵션 - request 를 보낼 scheme 설정 | default = http +# params: # 옵션 - request 요청 보낼 떄의 param +# user-id: ['tlsghwns1122@gmail.com'] +# static_configs: +# - targets: ['localhost:9100'] +scrape_configs: + - job_name: 'node_exporter' + metrics_path: /metrics + static_configs: + - targets: ['192.168.0.80:9100'] + # 그 외에도 authorization 설정 + # service discovery 설정(sd) + + # 실제 scrap 하는 타겟에 관한 설정 + # static_configs: + # - targets: ['192.168.0.80:3300', 'localhost:9100', 'localhost:80'] // prometheus, node-exporter, cadvisor + # labels: # 옵션 - scrap 해서 가져올 metrics 들 전부에게 붙여줄 라벨 + # service : 'monitor-1' + + # relabel_config - 스크랩되기 전의 label들을 수정 + # metric_relabel_configs - 가져오는 대상들의 레이블들을 동적으로 다시작성하는 설정(drop, replace, labeldrop) + + +# # Alerting specifies settings related to the Alertmanager. +# alerting: +# alert_relabel_configs: +# [ - ... ] +# alertmanagers: +# [ - ... ] \ No newline at end of file diff --git a/docker/monitoring/prometheus/config/rule.yml b/docker/monitoring/prometheus/config/rule.yml new file mode 100644 index 0000000..1155e3a --- /dev/null +++ b/docker/monitoring/prometheus/config/rule.yml @@ -0,0 +1,21 @@ +groups: +- name: example # 파일 내에서 unique 해야함 + rules: + + # Alert for any instance that is unreachable for >5 minutes. + - alert: InstanceDown + expr: up == 0 + for: 5m + labels: + severity: page + annotations: + summary: "Instance {{ $labels.instance }} down" + description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes." + + # Alert for any instance that has a median request latency >1s. + - alert: APIHighRequestLatency + expr: api_http_request_latencies_second{quantile="0.5"} > 1 + for: 10m + annotations: + summary: "High request latency on {{ $labels.instance }}" + description: "{{ $labels.instance }} has a median request latency above 1s (current value: {{ $value }}s)" \ No newline at end of file diff --git a/docker/monitoring/prometheus/prometheus-volume/data/01HR56HBDPPW8C97WH2DP5EKGF/meta.json b/docker/monitoring/prometheus/prometheus-volume/data/01HR56HBDPPW8C97WH2DP5EKGF/meta.json new file mode 100644 index 0000000..4b532f9 --- /dev/null +++ b/docker/monitoring/prometheus/prometheus-volume/data/01HR56HBDPPW8C97WH2DP5EKGF/meta.json @@ -0,0 +1,30 @@ +{ + "ulid": "01HR56HBDPPW8C97WH2DP5EKGF", + "minTime": 1709539243307, + "maxTime": 1709553600000, + "stats": { + "numSamples": 1410133, + "numSeries": 1494, + "numChunks": 11917 + }, + "compaction": { + "level": 2, + "sources": [ + "01HR4HZ2TH3SAFBKD7DGA2XY12", + "01HR4RSWT42NASZE0XSXF5G73E" + ], + "parents": [ + { + "ulid": "01HR4HZ2TH3SAFBKD7DGA2XY12", + "minTime": 1709539243307, + "maxTime": 1709546400000 + }, + { + "ulid": "01HR4RSWT42NASZE0XSXF5G73E", + "minTime": 1709546401007, + "maxTime": 1709553600000 + } + ] + }, + "version": 1 +} \ No newline at end of file diff --git a/docker/monitoring/prometheus/prometheus-volume/data/01HR5V4H98RJTZ8Y5Q9CE2NCWG/meta.json b/docker/monitoring/prometheus/prometheus-volume/data/01HR5V4H98RJTZ8Y5Q9CE2NCWG/meta.json new file mode 100644 index 0000000..57f2937 --- /dev/null +++ b/docker/monitoring/prometheus/prometheus-volume/data/01HR5V4H98RJTZ8Y5Q9CE2NCWG/meta.json @@ -0,0 +1,36 @@ +{ + "ulid": "01HR5V4H98RJTZ8Y5Q9CE2NCWG", + "minTime": 1709553601007, + "maxTime": 1709575200000, + "stats": { + "numSamples": 2144160, + "numSeries": 1489, + "numChunks": 17868 + }, + "compaction": { + "level": 2, + "sources": [ + "01HR4ZNM45ZPGGX2REKMVD7VQD", + "01HR56HB91DDEH98FPB4HVS60V", + "01HR5DD2H2VFTJN5BYZPRZCQ91" + ], + "parents": [ + { + "ulid": "01HR4ZNM45ZPGGX2REKMVD7VQD", + "minTime": 1709553601007, + "maxTime": 1709560800000 + }, + { + "ulid": "01HR56HB91DDEH98FPB4HVS60V", + "minTime": 1709560801007, + "maxTime": 1709568000000 + }, + { + "ulid": "01HR5DD2H2VFTJN5BYZPRZCQ91", + "minTime": 1709568001007, + "maxTime": 1709575200000 + } + ] + }, + "version": 1 +} \ No newline at end of file diff --git a/docker/monitoring/prometheus/prometheus-volume/data/01HR68VZHXPFGDPX8TY4P3334K/meta.json b/docker/monitoring/prometheus/prometheus-volume/data/01HR68VZHXPFGDPX8TY4P3334K/meta.json new file mode 100644 index 0000000..d195d46 --- /dev/null +++ b/docker/monitoring/prometheus/prometheus-volume/data/01HR68VZHXPFGDPX8TY4P3334K/meta.json @@ -0,0 +1,17 @@ +{ + "ulid": "01HR68VZHXPFGDPX8TY4P3334K", + "minTime": 1709596801007, + "maxTime": 1709604000000, + "stats": { + "numSamples": 714720, + "numSeries": 1489, + "numChunks": 5956 + }, + "compaction": { + "level": 1, + "sources": [ + "01HR68VZHXPFGDPX8TY4P3334K" + ] + }, + "version": 1 +} \ No newline at end of file diff --git a/docker/monitoring/prometheus/prometheus-volume/data/01HR6FQPT632CR8H0HD8VWM7S3/meta.json b/docker/monitoring/prometheus/prometheus-volume/data/01HR6FQPT632CR8H0HD8VWM7S3/meta.json new file mode 100644 index 0000000..ded0cc8 --- /dev/null +++ b/docker/monitoring/prometheus/prometheus-volume/data/01HR6FQPT632CR8H0HD8VWM7S3/meta.json @@ -0,0 +1,17 @@ +{ + "ulid": "01HR6FQPT632CR8H0HD8VWM7S3", + "minTime": 1709604001007, + "maxTime": 1709611200000, + "stats": { + "numSamples": 714720, + "numSeries": 1489, + "numChunks": 5956 + }, + "compaction": { + "level": 1, + "sources": [ + "01HR6FQPT632CR8H0HD8VWM7S3" + ] + }, + "version": 1 +} \ No newline at end of file diff --git a/docker/monitoring/prometheus/prometheus-volume/data/01HR6FQPYRNSZVAAWP760QNV5J/meta.json b/docker/monitoring/prometheus/prometheus-volume/data/01HR6FQPYRNSZVAAWP760QNV5J/meta.json new file mode 100644 index 0000000..e753660 --- /dev/null +++ b/docker/monitoring/prometheus/prometheus-volume/data/01HR6FQPYRNSZVAAWP760QNV5J/meta.json @@ -0,0 +1,36 @@ +{ + "ulid": "01HR6FQPYRNSZVAAWP760QNV5J", + "minTime": 1709575201007, + "maxTime": 1709596800000, + "stats": { + "numSamples": 2144160, + "numSeries": 1489, + "numChunks": 17868 + }, + "compaction": { + "level": 2, + "sources": [ + "01HR5M8ST9VDV23TKSMS4SE8JE", + "01HR5V4H3AK72PS98376WR83WS", + "01HR6208A2KW3WJ8KAS68MP5E5" + ], + "parents": [ + { + "ulid": "01HR5M8ST9VDV23TKSMS4SE8JE", + "minTime": 1709575201007, + "maxTime": 1709582400000 + }, + { + "ulid": "01HR5V4H3AK72PS98376WR83WS", + "minTime": 1709582401007, + "maxTime": 1709589600000 + }, + { + "ulid": "01HR6208A2KW3WJ8KAS68MP5E5", + "minTime": 1709589601007, + "maxTime": 1709596800000 + } + ] + }, + "version": 1 +} \ No newline at end of file