-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'develop' into feat/mlflow
- Loading branch information
Showing
120 changed files
with
13,608 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
# CI/CD pipeline for the model-serving image.
#   ci: builds the image from serving/ and pushes it to Docker Hub.
#   cd: connects to the serving host over SSH and restarts the container
#       from the freshly pushed image.
name: Model Serving

# NOTE(review): both jobs (including the image push and the deployment) also
# run for pull requests, i.e. before the change is merged - confirm that this
# is intended.
on:
  pull_request:
    branches: ['develop']  # runs when a pull request targets the develop branch
  push:
    branches: ['develop']  # runs when commits are pushed to the develop branch

jobs:
  ci:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}
      - name: Build docker file and setting deploy files
        run: |
          docker build -t makezenerator/serving:latest serving/
          docker push makezenerator/serving:latest

  cd:
    runs-on: ubuntu-latest
    needs: [ci]
    steps:
      - name: Connect to server
        # Pinned to a release tag instead of the moving `master` branch so a
        # change in the third-party action cannot silently alter deployments.
        uses: appleboy/ssh-action@v1.0.3
        env:
          DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
          DOCKERHUB_PASSWORD: ${{ secrets.DOCKERHUB_PASSWORD }}
        with:
          host: ${{ secrets.SERVER3_HOST }}
          username: ${{ secrets.SERVER_USERNAME }}
          key: ${{ secrets.SERVER3_KEY }}
          port: ${{ secrets.SERVER_PORT }}
          # Forward the credentials as environment variables rather than
          # templating them into the script text, where quotes or `$` in a
          # secret would be interpreted by the remote shell.
          envs: DOCKERHUB_USERNAME,DOCKERHUB_PASSWORD
          script: |
            # Read the password from stdin so it never appears in the process list.
            printf '%s' "$DOCKERHUB_PASSWORD" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin
            # NOTE(review): this stops and removes EVERY container on the host,
            # not only the serving container - confirm the host is dedicated.
            containers="$(docker ps -a -q)"
            # Skip stop/rm when nothing exists; docker errors on an empty argument list.
            if [ -n "$containers" ]; then
              docker stop $containers
              docker rm -f $containers
            fi
            docker pull makezenerator/serving:latest
            docker run -d -p 5050:5050 makezenerator/serving:latest
            docker image prune -f
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -173,4 +173,6 @@ index | |
0000*
meta.json
# Binary artifacts (presumably model checkpoints / exported weights - confirm).
# Each pattern is listed once; repeated entries have no additional effect.
*.tar
*.pth
*.onnx
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,172 @@ | ||
# Alertmanager configuration.
#
# The first section is an earlier draft that is kept commented out for
# reference; the active configuration starts at the `global:` key below it.

# ---------------------------------------------------------------------------
# Earlier draft (inactive)
# ---------------------------------------------------------------------------
# global:  # default values
#   resolve_timeout: 1m  # an alert is declared resolved if it is not updated within this time | default: 5m
#   # optional - e-mail related defaults
#   # optional - Slack related defaults
#
# # Template location, if templates are used: a /templates directory next to alertmanager.yml.
# templates:
#   - '/etc/alertmanager/templates/*.tmpl'
#
# # `route` defines the nodes of the routing tree and of its sub-trees.
# # Every alert enters the tree at the top-level route and walks the child
# # nodes; matching stops at a node whose `continue` is false, otherwise the
# # following sibling nodes are checked as well.
# route:
#   group_by: ['emailAlert']  # a label name from targets/labels in prometheus.yml
#   group_wait: 10s           # wait before the first notification, to avoid duplicates caused by inhibit conditions | default = 30s
#   group_interval: 300s      # wait before notifying about new alerts | default = 5m
#   repeat_interval: 1h       # wait before re-sending a notification that was already sent | default = 4h
#   receiver: 'email'         # notification target
#
#   # routes:  # route targets
#   #   - match:
#   #       alertname: 'test-alram'  # notify when alertname matches
#   #     receiver: 'slack-channel'
#
# receivers:
#   - name: 'email'
#     email_configs:
#       # NOTE(review): the addresses of this draft were lost; placeholders shown.
#       - to: '<recipient-address-1>, <recipient-address-2>'
#         from: '<sender-address>'
#         smarthost: 'email host'
#         auth_username: 'login username'
#         auth_identity: 'auth identity'
#         auth_password: 'password'
#         send_resolved: true  # also notify when the alert is resolved | default = false
#         # headers:
#         #   subject: "Custom Warning: {{ .CommonLabels.job }} container Down"
#         # html: '{{ template "email_template.html" . }}'
# inhibit_rules:
#   - source_match:
#       severity: 'critical'
#     target_match:  # alerts that should be muted
#       severity: 'warning'
#     equal: ['alertname', 'name']  # the inhibit rule applies when these labels have equal values
#
#   # NOTE(review): this receiver appears after inhibit_rules in the draft; it
#   # belongs under `receivers`, and the channel must be quoted ('#monitoring'),
#   # otherwise YAML treats it as a comment.
#   - name: slack-channel
#     slack_configs:
#       - channel: '#monitoring'
#         icon_url: https://avatars3.githubusercontent.com/u/3380462
#         send_resolved: true
#         title: '{{ template "custom_title" . }}'
#         text: '{{ template "custom_slack_message" . }}'

# ---------------------------------------------------------------------------
# Active configuration
# ---------------------------------------------------------------------------
# NOTE(review): the e-mail addresses below are the example.org values of the
# upstream Alertmanager sample configuration, and the SMTP credentials and
# PagerDuty keys are placeholders - replace all of them before relying on
# notifications.
global:
  # The smarthost and SMTP sender used for mail notifications.
  smtp_smarthost: 'localhost:25'
  smtp_from: 'alertmanager@example.org'
  smtp_auth_username: 'alertmanager'
  smtp_auth_password: 'password'

# The directory from which notification templates are read.
templates:
  - '/etc/alertmanager/template/*.tmpl'

# The root route on which each incoming alert enters.
route:
  # The labels by which incoming alerts are grouped together. For example,
  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
  # be batched into a single group.
  #
  # To aggregate by all possible labels use '...' as the sole label name.
  # This effectively disables aggregation entirely, passing through all
  # alerts as-is. This is unlikely to be what you want, unless you have
  # a very low alert volume or your upstream notification system performs
  # its own grouping. Example: group_by: [...]
  group_by: ['alertname', 'cluster', 'service']

  # When a new group of alerts is created by an incoming alert, wait at
  # least 'group_wait' to send the initial notification.
  # This ensures that multiple alerts for the same group that start firing
  # shortly after one another are batched together in the first notification.
  group_wait: 30s

  # When the first notification was sent, wait 'group_interval' to send a batch
  # of new alerts that started firing for that group.
  group_interval: 5m

  # If an alert has successfully been sent, wait 'repeat_interval' to
  # resend it.
  repeat_interval: 3h

  # The default receiver.
  receiver: team-X-mails

  # All the above attributes are inherited by all child routes and can be
  # overwritten on each.

  # The child route trees.
  routes:
    # This route performs a regular expression match on alert labels to
    # catch alerts that are related to a list of services.
    - matchers:
        - 'service=~"foo1|foo2|baz"'
      receiver: team-X-mails
      # The service has a sub-route for critical alerts; any alerts
      # that do not match, i.e. severity != critical, fall back to the
      # parent node and are sent to 'team-X-mails'.
      routes:
        - matchers:
            - 'severity="critical"'
          receiver: team-X-pager

    - matchers:
        - 'service="files"'
      receiver: team-Y-mails
      routes:
        - matchers:
            - 'severity="critical"'
          receiver: team-Y-pager

    # This route handles all alerts coming from a database service. If there's
    # no team to handle it, it defaults to the DB team.
    - matchers:
        - 'service="database"'
      receiver: team-DB-pager
      # Also group alerts by affected database.
      group_by: [alertname, cluster, database]
      routes:
        - matchers:
            - 'owner="team-X"'
          receiver: team-X-pager
          continue: true
        - matchers:
            - 'owner="team-Y"'
          receiver: team-Y-pager

# Inhibition rules allow muting a set of alerts while another alert is
# firing.
# We use this to mute any warning-level notifications if the same alert is
# already critical.
inhibit_rules:
  - source_matchers:
      - 'severity="critical"'
    target_matchers:
      - 'severity="warning"'
    # Apply inhibition if the alertname is the same.
    # CAUTION:
    #   If all label names listed in `equal` are missing
    #   from both the source and target alerts,
    #   the inhibition rule will apply!
    equal: [alertname, cluster, service]

receivers:
  - name: 'team-X-mails'
    email_configs:
      - to: 'team-X+alerts@example.org'

  - name: 'team-X-pager'
    email_configs:
      - to: 'team-X+alerts-critical@example.org'
    pagerduty_configs:
      - service_key: '<team-X-key>'

  - name: 'team-Y-mails'
    email_configs:
      - to: 'team-Y+alerts@example.org'

  - name: 'team-Y-pager'
    pagerduty_configs:
      - service_key: '<team-Y-key>'

  - name: 'team-DB-pager'
    pagerduty_configs:
      - service_key: '<team-DB-key>'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
# Monitoring stack: Prometheus, Grafana and Alertmanager, all attached to the
# shared `promnet` bridge network.
version: '3.7'

services:
  prometheus:
    container_name: prometheus
    image: prom/prometheus
    user: root
    restart: always
    command:
      - '--web.enable-lifecycle'
      - '--config.file=/etc/prometheus/prometheus.yml'
    ports:
      - '9090:9090'
    volumes:
      # Configuration directory (must contain the prometheus.yml named above).
      - './prometheus/config/:/etc/prometheus/'
      - './prometheus/prometheus-volume:/prometheus'
    networks:
      - promnet

  grafana:
    container_name: grafana
    image: grafana/grafana
    user: root
    restart: always
    ports:
      # Host port 3300 maps to container port 3000.
      - '3300:3000'
    volumes:
      - './grafana-volume:/var/lib/grafana'
    networks:
      - promnet

  alertmanager:
    container_name: alertmanager
    image: prom/alertmanager
    user: root
    restart: always
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
    ports:
      - '9093:9093'
    volumes:
      # Configuration directory (must contain the alertmanager.yml named above).
      - './alertmanager/config/:/etc/alertmanager/'
    networks:
      - promnet

networks:
  promnet:
    driver: bridge
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
# Node exporter: exposes machine metrics on port 9100 for Prometheus to scrape.
version: '3.7'

services:
  node:
    image: prom/node-exporter
    container_name: node-exporter
    # Come back up after a crash or a host reboot.
    restart: always
    # Share the host PID namespace and mount the host root filesystem
    # read-only, so the exporter reports the host's processes and filesystems
    # rather than the container's own view.
    # NOTE(review): network metrics still come from the container's network
    # namespace because the service stays on the bridge network.
    pid: host
    command:
      - '--path.rootfs=/host'
    volumes:
      - '/:/host:ro,rslave'
    ports:
      - '9100:9100'
    networks:
      - promnet

networks:
  promnet:
    driver: bridge
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
# Prometheus server configuration.

# Global defaults - every setting in this section is optional.
global:
  scrape_interval: 15s     # default interval between scrapes of a target | default = 1m
  scrape_timeout: 15s      # how long a scrape request may take before it times out | default = 10s
  evaluation_interval: 2m  # how often rules are evaluated | default = 1m

  # Attach these labels to any time series or alerts when communicating with
  # external systems (federation, remote storage, Alertmanager).
  external_labels:
    monitor: 'codelab-monitor'  # label attached by default
  # query_log_file: <path-to-log-file>.log  # records Prometheus query logs; nothing is recorded when unset

# Rule files are loaded and evaluated periodically according to
# 'evaluation_interval'.
rule_files:
  - 'rule.yml'  # located in the same directory as prometheus.yml

# Send firing alerts to Alertmanager. Without this section the rules above are
# evaluated but no notification ever leaves Prometheus.
# The target is the Alertmanager container's name and port on the shared
# compose network.
# NOTE(review): adjust the target if Alertmanager runs elsewhere.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']
  # alert_relabel_configs:
  #   [ - <relabel_config> ... ]

# Reference example of a fully spelled-out scrape job (inactive).
# scrape_configs:
#   # The job name is added as the label `job=<job_name>` to every time series
#   # collected by this job.
#   # metrics_path defaults to '/metrics' and scheme defaults to `http`.
#   - job_name: 'monitoring-item'  # must be unique across all scrape jobs
#     scrape_interval: 10s         # may be omitted; the default is defined under global
#     scrape_timeout: 10s          # may be omitted; the default is defined under global
#     metrics_path: '/metrics'     # optional - URI Prometheus requests to obtain metrics | default = /metrics
#     honor_labels: false          # optional - on a label conflict, false keeps the server-side label and renames the scraped one to exported_<name> | default = false
#     honor_timestamps: false      # optional - whether timestamps present in the scraped data are used | default = true
#     scheme: 'http'               # optional - scheme used for the request | default = http
#     params:                      # optional - URL parameters sent with the request
#       user-id: ['<user-id>']
#     static_configs:
#       - targets: ['localhost:9100']

# Endpoints from which metrics are collected.
scrape_configs:
  - job_name: 'node_exporter'
    metrics_path: /metrics
    static_configs:
      - targets: ['192.168.0.80:9100']
    # Further options: authorization settings,
    # service discovery settings (sd).

    # Settings for the targets that are actually scraped:
    # static_configs:
    #   - targets: ['192.168.0.80:3300', 'localhost:9100', 'localhost:80']  # prometheus, node-exporter, cadvisor
    #     labels:  # optional - labels attached to every metric scraped from these targets
    #       service: 'monitor-1'

    # relabel_configs - rewrites a target's labels before it is scraped
    # metric_relabel_configs - dynamically rewrites the labels of scraped samples (drop, replace, labeldrop)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
# Alerting rules evaluated by Prometheus.
groups:
  - name: example  # must be unique within this file
    rules:
      # Fires when a scrape target has been unreachable for more than 5 minutes.
      - alert: InstanceDown
        expr: 'up == 0'
        for: 5m
        labels:
          severity: page
        annotations:
          summary: 'Instance {{ $labels.instance }} down'
          description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.'

      # Fires when an instance's median request latency has stayed above 1s for 10 minutes.
      - alert: APIHighRequestLatency
        expr: 'api_http_request_latencies_second{quantile="0.5"} > 1'
        for: 10m
        annotations:
          summary: 'High request latency on {{ $labels.instance }}'
          description: '{{ $labels.instance }} has a median request latency above 1s (current value: {{ $value }}s)'
Oops, something went wrong.