Add Prometheus alert rules and an Alertmanager service; run the collector image as a non-root user.

Summary of the change set:
  * Dockerfile: install the binary at /bin/metricly, use /metricly as the
    working directory, and switch to the `nobody` user before the entrypoint.
  * Makefile, docker-compose.yml: point the health checks at
    /metricly/healthcheck.
  * config/prometheus/prometheus.yml: load rule files from
    /etc/prometheus/alerts/*.yml and send alerts to 127.0.0.1:9093.
  * config/prometheus/alerts/: warning (> 60) and critical (> 80) rules for
    CPU, disk and memory.
  * config/alertmanager/alertmanager.yml: e-mail receiver template with
    placeholder addresses and password.
  * config/grafana/visualizations/: new location of the two dashboard JSON
    files (pure renames).

Review fixes folded into this revision of the patch:
  * The top-level alertmanager.yml is no longer added. It duplicated
    config/alertmanager/alertmanager.yml, was not referenced by any other file
    in this patch, and contained what look like real mailbox credentials.
    Treat that app password as leaked and rotate it.
  * The healthcheck script is made executable again (at its new path), because
    the Makefile health command still executes it directly.
  * The compose healthcheck passes one argv element per argument; exec-form
    ("CMD") tests are not split on spaces.
  * prometheus.yml and docker-compose.yml now end with a newline.
  * README wording and a comment typo in the Dockerfile were corrected.

NOTE(review): this patch was recovered from a whitespace-collapsed copy.
Indentation and the position of blank context lines were reconstructed from the
hunk line counts; confirm against the target tree, or apply with
`git apply --ignore-whitespace`. The `index` blob ids are those of the original
patch and do not reflect the edited `+` lines.

diff --git a/Dockerfile b/Dockerfile
index 369d3e6..6a5cb2d 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -9,16 +9,19 @@ RUN CGO_ENABLED=0 GOOS=linux go build -o metricly cmd/collector/main.go
 #
 # Use a minimal base image for the final image
 FROM quay.io/jitesoft/alpine:3.20.3
-WORKDIR /root
-COPY --from=builder /app/metricly .
+COPY --from=builder /app/metricly /bin/metricly
+WORKDIR /metricly
 COPY ./config/healthcheck .
-RUN chmod +x /root/healthcheck
+RUN chmod +x /metricly/healthcheck
 RUN mkdir /etc/metricly
 
+RUN chown -R nobody:nobody /etc/metricly /metricly
+
 # Expose the port
 EXPOSE 8080
+USER nobody
 # Run the metrics collector
-ENTRYPOINT ["./metricly"]
+ENTRYPOINT ["/bin/metricly"]
 
-# Default agrs
+# Default args
 CMD ["--config", "/etc/metricly/config.yaml"]
diff --git a/Makefile b/Makefile
index 24540b9..27f0815 100644
--- a/Makefile
+++ b/Makefile
@@ -33,7 +33,7 @@ run_container:
 	-p 8080:8080 \
 	-v ./config/config.yaml:/etc/metricly/config.yaml:ro,z \
 	-v /:/host/root:ro,slave \
-	--health-cmd "/root/healthcheck metricly" \
+	--health-cmd "/metricly/healthcheck metricly" \
 	-e HOSTNAME=${HOSTNAME} \
 	-e PROC_CPU_STAT=/host/root/proc/stat \
 	-e PROC_MEMORY_INFO=/host/root/proc/meminfo \
diff --git a/README.md b/README.md
index d63f0c6..0db35c8 100644
--- a/README.md
+++ b/README.md
@@ -23,6 +23,8 @@
   - Logs incoming and outgoing API requests with support for multiple log levels (INFO, DEBUG, ERROR).
 - **Metrics Visualization**
   - Provides an inbuilt `Grafana` dashboard to visualize all metrics.
+- **Alerting Mechanism**
+  - Provides inbuilt Prometheus alert rules and an `Alertmanager` setup that sends Gmail alerts.
 
 ---
 ## **Getting Started**
@@ -371,6 +373,17 @@ The Metricly exporter provides the following API endpoints:
 }
 ```
 
+### **Alertmanager Configuration** ###
+Metricly ships with a few inbuilt alerts that monitor high CPU, memory and disk utilization.
+
+![Sample Alerts](doc/alerts.png)
+
+When the condition for any alert is met, an email notification is sent to the receiver configured in `config/alertmanager/alertmanager.yml`.
+
+![High CPU Alert](doc/high_cpu_alert.png)
+
+To add more alerts, see `config/prometheus/alerts/`. New rule files can be modeled on the existing ones and placed in the same directory.
+
 ### **Development**
 
 #### **Testing**
diff --git a/config/alertmanager/alertmanager.yml b/config/alertmanager/alertmanager.yml
new file mode 100644
index 0000000..cda86b6
--- /dev/null
+++ b/config/alertmanager/alertmanager.yml
@@ -0,0 +1,15 @@
+global:
+  resolve_timeout: 5m
+
+route:
+  receiver: 'default'
+
+receivers:
+  - name: 'default'
+    email_configs:
+      - to: 'example@gmail.com' # receiver's address
+        from: 'alertmanager@example.com'
+        smarthost: 'smtp.gmail.com:587'
+        auth_username: 'example@gmail.com' # sender's address
+        auth_identity: 'example@gmail.com' # sender's address
+        auth_password: 'xxxxxxxxxxxxx' # gmail app password generated by sender
diff --git a/config/go-runtime.json b/config/grafana/visualizations/go-runtime.json
similarity index 100%
rename from config/go-runtime.json
rename to config/grafana/visualizations/go-runtime.json
diff --git a/config/infrastructure.json b/config/grafana/visualizations/infrastructure.json
similarity index 100%
rename from config/infrastructure.json
rename to config/grafana/visualizations/infrastructure.json
diff --git a/config/prometheus/alerts/cpu_alerts.yml b/config/prometheus/alerts/cpu_alerts.yml
new file mode 100644
index 0000000..d520670
--- /dev/null
+++ b/config/prometheus/alerts/cpu_alerts.yml
@@ -0,0 +1,20 @@
+groups:
+  - name: cpu_alerts
+    rules:
+      - alert: CPUUsage > 60%
+        expr: avg_over_time(metricly_cpu_total[5m]) > 60
+        for: 1m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High CPU usage detected"
+          description: "CPU usage is above 60% for the last 5 minutes on host {{ $labels.host }}"
+
+      - alert: CPUUsage > 80%
+        expr: avg_over_time(metricly_cpu_total[15m]) > 80
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "High CPU usage detected"
+          description: "CPU usage is above 80% for the last 15 minutes on host {{ $labels.host }}"
diff --git a/config/prometheus/alerts/disk_alerts.yml b/config/prometheus/alerts/disk_alerts.yml
new file mode 100644
index 0000000..2b64803
--- /dev/null
+++ b/config/prometheus/alerts/disk_alerts.yml
@@ -0,0 +1,20 @@
+groups:
+  - name: disk_alerts
+    rules:
+      - alert: Disk Usage > 60%
+        expr: 100*metricly_disk_used_bytes/metricly_disk_total_bytes > 60
+        for: 1m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High Disk usage detected"
+          description: "Disk usage is above 60%"
+
+      - alert: Disk Usage > 80%
+        expr: 100*metricly_disk_used_bytes/metricly_disk_total_bytes > 80
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "High Disk usage detected"
+          description: "Disk usage is above 80%"
diff --git a/config/prometheus/alerts/memory_alerts.yml b/config/prometheus/alerts/memory_alerts.yml
new file mode 100644
index 0000000..9e9d521
--- /dev/null
+++ b/config/prometheus/alerts/memory_alerts.yml
@@ -0,0 +1,20 @@
+groups:
+  - name: memory_alerts
+    rules:
+      - alert: Memory Usage > 60%
+        expr: 100*(metricly_memory_total_bytes-metricly_memory_available_bytes)/metricly_memory_total_bytes > 60
+        for: 1m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High Memory usage detected"
+          description: "Memory usage is above 60%"
+
+      - alert: Memory Usage > 80%
+        expr: 100*(metricly_memory_total_bytes-metricly_memory_available_bytes)/metricly_memory_total_bytes > 80
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "High Memory usage detected"
+          description: "Memory usage is above 80%"
diff --git a/config/prometheus/prometheus.yml b/config/prometheus/prometheus.yml
index c7c3418..592ae79 100644
--- a/config/prometheus/prometheus.yml
+++ b/config/prometheus/prometheus.yml
@@ -8,3 +8,12 @@ scrape_configs:
     static_configs:
       - targets:
           - '127.0.0.1:8080' # Target where your app exposes metrics
+
+rule_files:
+  - "/etc/prometheus/alerts/*.yml"
+
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets:
+            - '127.0.0.1:9093'
diff --git a/doc/alerts.png b/doc/alerts.png
new file mode 100644
index 0000000..d915e70
Binary files /dev/null and b/doc/alerts.png differ
diff --git a/doc/high_cpu_alert.png b/doc/high_cpu_alert.png
new file mode 100644
index 0000000..042da37
Binary files /dev/null and b/doc/high_cpu_alert.png differ
diff --git a/docker-compose.yml b/docker-compose.yml
index 9ae55c5..98cf7ec 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -8,11 +8,6 @@ services:
     network_mode: host
     volumes:
       - ./config/config.yaml:/etc/metricly/config.yaml:ro,z
-      # - /proc/stat:/proc/stat:ro
-      # - /proc/meminfo:/proc/meminfo:ro
-      # - /proc/diskstats:/proc/diskstats:ro
-      # - /proc/net/dev:/mnt/metricly/dev:ro
-      # - /proc/self/mounts:/mnt/metricly/mounts:ro
       - /:/host/root:ro,rslave # Changes in the source (host) are reflected in the container, not vice-versa
     environment:
       - HOSTNAME=${HOSTNAME}
@@ -22,7 +17,7 @@ services:
       - PROC_DISK_MOUNTS=/host/root/proc/mounts
       - PROC_DISK_STATS=/host/root/proc/diskstats
     healthcheck:
-      test: ["CMD", "/root/healthcheck metricly"]
+      test: ["CMD", "/bin/sh", "/metricly/healthcheck", "metricly"]
       interval: 30s
       timeout: 5s
       retries: 3
@@ -34,6 +29,7 @@ services:
     network_mode: host
     volumes:
       - ./config/prometheus/:/etc/prometheus/:ro,z
+      - ./config/prometheus/alerts:/etc/prometheus/alerts:ro,z
     command:
       - '--config.file=/etc/prometheus/prometheus.yml'
       - '--storage.tsdb.path=/prometheus'
@@ -52,8 +48,18 @@ services:
       - GF_SECURITY_ADMIN_PASSWORD=admin
     volumes:
       - ./config/grafana/provisioning/:/etc/grafana/provisioning/:ro,z
-      - ./config/infrastructure.json:/var/lib/grafana/dashboards/infrastructure.json:ro,z
-      - ./config/go-runtime.json:/var/lib/grafana/dashboards/go-runtime.json:ro,z
+      - ./config/grafana/visualizations/infrastructure.json:/var/lib/grafana/dashboards/infrastructure.json:ro,z
+      - ./config/grafana/visualizations/go-runtime.json:/var/lib/grafana/dashboards/go-runtime.json:ro,z
     restart: always
     depends_on:
       - prometheus
+
+  alertmanager:
+    container_name: metricly_alertmanager
+    image: docker.io/prom/alertmanager:v0.27.0
+    network_mode: host
+    volumes:
+      - ./config/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro,z
+    restart: always
+    depends_on:
+      - prometheus