Skip to content

Commit

Permalink
Merge pull request #16 from Make-Zenerator/develop
Browse files Browse the repository at this point in the history
[Feat] V2 pull request
  • Loading branch information
internationalwe authored Sep 10, 2024
2 parents 580e7b1 + 6b27b40 commit 8d76398
Show file tree
Hide file tree
Showing 237 changed files with 42,733 additions and 2 deletions.
41 changes: 41 additions & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# CI/CD for the model-serving image: the `ci` job builds the image from
# serving/ and pushes it to Docker Hub; the `cd` job then redeploys it on the
# serving host over SSH.
name: Model Serving

on:
  pull_request:
    branches: ['develop']  # runs when a pull request targets the develop branch
  push:
    branches: ['develop']  # runs when commits are pushed to the develop branch

# NOTE(review): both jobs also run for pull_request events, so an unmerged PR
# overwrites the `latest` tag and is deployed - confirm this is intended.
jobs:
  ci:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v3
      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}
      - name: Build docker file and setting deploy files
        run: |
          docker build -t makezenerator/serving:latest serving/
          docker push makezenerator/serving:latest
  cd:
    runs-on: ubuntu-latest
    needs: [ci]
    steps:
      - name: Connect to server
        # Pinned to a release tag instead of the moving `master` branch.
        uses: appleboy/ssh-action@v1.0.3
        with:
          host: ${{ secrets.SERVER3_HOST }}
          username: ${{ secrets.SERVER_USERNAME }}
          key: ${{ secrets.SERVER3_KEY }}
          port: ${{ secrets.SERVER_PORT }}
          script: |
            # Feed the password on stdin so it never appears in the process list.
            printf '%s' "${{ secrets.DOCKERHUB_PASSWORD }}" | docker login -u "${{ secrets.DOCKERHUB_USERNAME }}" --password-stdin
            # NOTE(review): this stops and removes EVERY container on the host,
            # not only the serving one - confirm nothing else runs there.
            # `xargs -r` skips the command when the list is empty, so a host
            # with no containers no longer makes docker exit with a usage error.
            docker ps -a -q | xargs -r docker stop
            docker ps -a -q | xargs -r docker rm -f
            docker pull makezenerator/serving:latest
            docker run -d -p 5050:5050 makezenerator/serving:latest
            docker image prune -f
26 changes: 26 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -152,9 +152,35 @@ dmypy.json
# Cython debug symbols
cython_debug/

# Model weights and other serialized artifacts
*.pt
*.pth
*.onnx
*.pkl
*.tar
# PyCharm
#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
#  and can be added to the global gitignore or merged into this file.  For a more nuclear
#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# Experiment-tracking output
wandb/
events.out*

# Monitoring runtime state (presumably Prometheus TSDB / Grafana files).
# NOTE(review): the bare names below are unanchored and match a file or
# directory with that name anywhere in the repository - confirm that is wanted.
grafana.db
0000*
lock
queries.active
tombstones
index
meta.json
prometheus/prometheus-volume/data/*
docker/monitoring/prometheus/prometheus-volume/data/*
docker/monitoring/grafana-volume/*
pipeline/docker/monitoring/prometheus/prometheus-volume/data/*
pipeline/docker/monitoring/grafana-volume/*

# Local scratch files and environment files
serving/sf2f/temp.ipynb
.env.serving
4 changes: 2 additions & 2 deletions .gitmessage.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# <타입> : <제목> 의 형식으로 제목을 아래 공백줄에 작성
# 타입의 첫 문자는 대문자 / 제목은 50자 이내 / 변경사항이 "무엇"인지 명확히 작성 / 끝에 마침표 금지
# 예) Feat : Add login

Feat : Add alertmanager.yml & prometheus.yml
# 바로 아래 공백은 지우지 마세요 (제목과 본문의 분리를 위함)

################
Expand All @@ -12,7 +12,7 @@
################
# 꼬릿말(footer)을 아랫줄에 작성 (현재 커밋과 관련된 이슈 번호 추가 등)
# 예) #7
-
- #4
################
# Feat : 새로운 기능 추가
# Fix : 버그 수정
Expand Down
14 changes: 14 additions & 0 deletions docker/mlflow/DockerFile.mlflow
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# MLflow server image: Python 3.9 with MLflow, boto3, the PostgreSQL driver
# and the MinIO client (mc).
FROM amd64/python:3.9-slim

# Install only the named packages (no recommended extras) and drop the apt
# lists in the same layer so they do not bloat the image.
# ca-certificates is listed explicitly because wget fetches mc over HTTPS.
RUN apt-get update && apt-get install -y --no-install-recommends \
    ca-certificates \
    git \
    wget \
    && rm -rf /var/lib/apt/lists/*

# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir -U pip && \
    pip install --no-cache-dir boto3==1.26.8 mlflow==1.30.0 psycopg2-binary

# Download the MinIO client straight to its final location and make it executable.
RUN wget -q -O /usr/bin/mc https://dl.min.io/client/mc/release/linux-amd64/mc && \
    chmod +x /usr/bin/mc
58 changes: 58 additions & 0 deletions docker/mlflow/docker-compose_mlflow.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# MLflow stack: PostgreSQL as the backend store, MinIO as the artifact store,
# and an MLflow server built from the Dockerfile in this directory.
version: "3"

services:
  mlflow-backend-store:
    image: postgres:14.0
    container_name: mlflow-backend-store
    environment:
      POSTGRES_USER: mlflowuser
      POSTGRES_PASSWORD: mlflowpassword
      POSTGRES_DB: mlflowdatabase
    healthcheck:
      test: ["CMD", "pg_isready", "-q", "-U", "mlflowuser", "-d", "mlflowdatabase"]
      interval: 10s
      timeout: 5s
      retries: 5

  mlflow-artifact-store:
    image: minio/minio:RELEASE.2024-01-18T22-51-28Z
    container_name: mlflow-artifact-store
    ports:
      - "9000:9000"  # S3 API
      - "9001:9001"  # web console (see --console-address below)
    environment:
      MINIO_ROOT_USER: minio
      MINIO_ROOT_PASSWORD: miniostorage
    command: server /data/minio --console-address :9001
    healthcheck:
      test: ["CMD", "mc", "ready", "local"]
      interval: 5s
      timeout: 5s
      retries: 5

  mlflow-server:
    build:
      context: .
      # Must match the Dockerfile's actual file name in this directory.
      dockerfile: DockerFile.mlflow
    container_name: mlflow-server
    # Start only after both stores report healthy.
    depends_on:
      mlflow-backend-store:
        condition: service_healthy
      mlflow-artifact-store:
        condition: service_healthy
    ports:
      - "5001:5000"
    environment:
      # Credentials are read from the shell environment or an untracked .env
      # file; compose aborts with the message below when one is missing.
      # boto3 reads AWS_ACCESS_KEY_ID (not AWS_ACCESS_KEY). The values must be
      # accepted by the endpoint in MLFLOW_S3_ENDPOINT_URL.
      # SECURITY: the key pair that used to be hard-coded here is in git
      # history; treat it as compromised and rotate it.
      AWS_ACCESS_KEY_ID: "${AWS_ACCESS_KEY_ID:?set AWS_ACCESS_KEY_ID}"
      AWS_SECRET_ACCESS_KEY: "${AWS_SECRET_ACCESS_KEY:?set AWS_SECRET_ACCESS_KEY}"
      MLFLOW_S3_ENDPOINT_URL: http://mlflow-artifact-store:9000
    # Register the MinIO host with mc, create the `mlflow` bucket if it does
    # not exist yet, then start the MLflow server.
    command:
      - /bin/sh
      - -c
      - |
        mc config host add mlflowminio http://mlflow-artifact-store:9000 minio miniostorage &&
        mc mb --ignore-existing mlflowminio/mlflow
        mlflow server \
        --backend-store-uri postgresql://mlflowuser:mlflowpassword@mlflow-backend-store/mlflowdatabase \
        --default-artifact-root s3://mlflow/ \
        --host 0.0.0.0
35 changes: 35 additions & 0 deletions docker/monitoring/alertmanager/config/alertmanager.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Alertmanager configuration: every alert is sent to one Slack-format webhook
# receiver; critical alerts suppress matching warning alerts.
global:
  resolve_timeout: 1m

route:
  group_by: ['alertname', 'instance']
  group_wait: 10s
  group_interval: 5m
  repeat_interval: 1h
  receiver: 'slack-notifications'
  routes:
    - match:
        severity: 'critical'
      receiver: 'slack-notifications'

receivers:
  - name: 'slack-notifications'
    slack_configs:
      # The webhook URL embeds a secret token, so it is read from a file that
      # must NOT be committed. The file holds a single line: the Discord
      # webhook URL with "/slack" appended (Discord's Slack-compatible endpoint).
      # SECURITY: the URL previously hard-coded here is in git history; delete
      # that webhook in Discord and create a new one.
      - api_url_file: '/etc/alertmanager/secrets/discord_webhook_url'
        channel: '#alerts'
        send_resolved: true
        title: '[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] Monitoring Alert - {{ .CommonLabels.alertname }} for {{ .CommonLabels.instance }}'
        # Literal block scalar: the line breaks below are sent as real
        # newlines. (A "\n" typed inside a YAML block scalar is not an escape
        # and would be sent as the two characters backslash and n.)
        text: |-
          {{ range .Alerts -}}
          *Alert:* {{ .Annotations.summary }}
          *Description:* {{ .Annotations.description }}
          *Details:*
          {{ range .Labels.SortedPairs }} • *{{ .Name }}:* {{ .Value }}
          {{ end }}
          {{ end }}

# While a critical alert is firing, mute warning alerts that share the same
# alertname and instance.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'instance']
47 changes: 47 additions & 0 deletions docker/monitoring/docker-compose_monitoring.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Monitoring stack: Prometheus, Grafana and Alertmanager on one shared bridge
# network, each with its config or data directory bind-mounted from the host.
version: '3.7'

services:
  prometheus:
    container_name: prometheus
    image: prom/prometheus
    user: root
    restart: always
    command:
      - '--web.enable-lifecycle'
      - '--config.file=/etc/prometheus/prometheus.yml'
    ports:
      - '9090:9090'
    volumes:
      - './prometheus/config/:/etc/prometheus/'
      - './prometheus/prometheus-volume:/prometheus'
    networks:
      - promnet

  grafana:
    container_name: grafana
    image: grafana/grafana
    user: root
    restart: always
    ports:
      - '3300:3000'  # host port 3300 -> container port 3000
    volumes:
      - './grafana-volume:/var/lib/grafana'
    networks:
      - promnet

  alertmanager:
    container_name: alertmanager
    image: prom/alertmanager
    user: root
    restart: always
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
    ports:
      - '9093:9093'
    volumes:
      - './alertmanager/config/:/etc/alertmanager/'
    networks:
      - promnet

networks:
  promnet:
    driver: bridge
26 changes: 26 additions & 0 deletions docker/monitoring/docker-compose_node_exporter.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Exporters deployed on each monitored host: node-exporter for host metrics
# and NVIDIA's dcgm-exporter for GPU metrics.
version: '3.7'

services:
  node:
    image: prom/node-exporter
    container_name: node-exporter
    # node-exporter is meant to observe the HOST. Without the host root
    # filesystem and PID namespace it reports the container's own view, so
    # mount / read-only at /host and point the exporter at it.
    command:
      - '--path.rootfs=/host'
    pid: host
    volumes:
      - '/:/host:ro,rslave'
    ports:
      - '9100:9100'
    networks:
      - promnet
    # Same restart policy as the services in the monitoring stack.
    restart: always

  dcgm:
    image: nvcr.io/nvidia/k8s/dcgm-exporter:3.2.6-3.1.9-ubuntu20.04
    container_name: dcgm-exporter
    ports:
      - '9400:9400'
    # Reserve one NVIDIA GPU for the exporter.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    networks:
      - promnet
    restart: always

networks:
  promnet:
    driver: bridge
55 changes: 55 additions & 0 deletions docker/monitoring/prometheus/config/prometheus.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Global defaults - every setting in this section is optional.
global:
  scrape_interval: 15s  # how often targets are scraped / default = 1m
  scrape_timeout: 15s  # how long a scrape request may run before it times out / default = 10s
  evaluation_interval: 2m  # how often rules are evaluated / default = 1m

  # Attach these labels to any time series or alerts when communicating with
  # external systems (federation, remote storage, Alertmanager).
  external_labels:
    monitor: 'codelab-monitor'  # label attached by default
  # query_log_file: <path-to-log-file>.log  # records Prometheus query logs; nothing is logged when unset

# Load rules and evaluate them periodically according to 'evaluation_interval'.
rule_files:
  - "rule.yml"  # located in the same directory as prometheus.yml

# Where firing alerts are sent. Without this section the rules in rule.yml are
# evaluated but never delivered to Alertmanager.
# `alertmanager` is the service name from docker-compose_monitoring.yaml, which
# shares the `promnet` network with Prometheus.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']

# Endpoints to scrape metrics from: the node exporters on the inference, web
# and MinIO hosts, plus the GPU exporter on the inference host.
scrape_configs:
  - job_name: 'inference_node_exporter'
    metrics_path: /metrics
    static_configs:
      - targets: ['175.45.193.25:9100']
  - job_name: 'web_node_exporter'
    metrics_path: /metrics
    static_configs:
      - targets: ['175.45.194.59:9100']
  - job_name: 'minio_node_exporter'
    metrics_path: /metrics
    static_configs:
      - targets: ['223.130.133.236:9100']
  - job_name: 'gpu_exporter'
    metrics_path: /metrics
    static_configs:
      - targets: ['175.45.193.25:9400']

# Other available settings: authorization
# and service discovery (sd)

# Settings for the targets that are actually scraped
# static_configs:
# - targets: ['192.168.0.80:3300', 'localhost:9100', 'localhost:80'] // prometheus, node-exporter, cadvisor
# labels: # optional - labels attached to every metric scraped from these targets
# service : 'monitor-1'

# relabel_config - rewrites labels before the scrape happens
# metric_relabel_configs - dynamically rewrites labels of the scraped samples (drop, replace, labeldrop)


# # Further Alertmanager-related options not used above:
# alerting:
# alert_relabel_configs:
# [ - <relabel_config> ... ]
# alertmanagers:
# [ - <alertmanager_config> ... ]
21 changes: 21 additions & 0 deletions docker/monitoring/prometheus/config/rule.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Alerting rules loaded by Prometheus through `rule_files`.
# Severity values follow the ones the Alertmanager config in this repository
# routes and inhibits on: 'critical' and 'warning'.
groups:
  - name: example  # must be unique within this file
    rules:

      # Alert for any instance that is unreachable for >5 minutes.
      - alert: InstanceDown
        expr: up == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."

      # Alert for any instance that has a median request latency >1s.
      # NOTE(review): no scrape job visible in prometheus.yml obviously exports
      # api_http_request_latencies_second - confirm this metric exists.
      - alert: APIHighRequestLatency
        expr: api_http_request_latencies_second{quantile="0.5"} > 1
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High request latency on {{ $labels.instance }}"
          description: "{{ $labels.instance }} has a median request latency above 1s (current value: {{ $value }}s)"
23 changes: 23 additions & 0 deletions docker/pipeline/Dockerfile.sf2f
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Image for the sf2f pipeline: PyTorch runtime plus audio-processing, MLflow,
# object-storage and Flask dependencies.
FROM pytorch/pytorch:2.2.1-cuda11.8-cudnn8-runtime

ENV base_path=/workspace/

# Update and install in one layer so a cached, stale package index is never
# reused, then drop the apt lists to keep the layer small.
RUN apt-get update -y && \
    apt-get install -y gcc && \
    rm -rf /var/lib/apt/lists/*

# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir glog \
    python_speech_features \
    webrtcvad \
    pydub \
    mlflow \
    minio \
    boto3 \
    flask \
    flask_cors \
    gunicorn \
    python-dotenv

# Copy the sources last so editing code does not invalidate the dependency
# layers above.
COPY . ${base_path}

EXPOSE 3002

# CMD [ "python", "inference.py" ]
Loading

0 comments on commit 8d76398

Please sign in to comment.