From ada16d59655d3640abc699d77ba8a8131d6f0a43 Mon Sep 17 00:00:00 2001 From: Kevin Lewin <97046295+lewinkedrs@users.noreply.github.com> Date: Wed, 24 Jan 2024 12:09:52 -0500 Subject: [PATCH] feat(module: eks-monitoring) Add NVIDIA gpu monitoring dashboards (#257) * gpu dashboards * fixing locals * doc start * Update gpumon.md * fixing typos and doc names * fixing module name * fixing mkdocs * gpu to nvidia * Apply pre-commit --------- Co-authored-by: Rodrigue Koffi --- docs/eks/gpu-monitoring.md | 38 ++++++++++++++++++++++++++++ mkdocs.yml | 1 + modules/eks-monitoring/README.md | 3 +++ modules/eks-monitoring/dashboards.tf | 20 +++++++++++++++ modules/eks-monitoring/locals.tf | 9 +++++++ modules/eks-monitoring/main.tf | 4 +++ modules/eks-monitoring/variables.tf | 20 +++++++++++++++ 7 files changed, 95 insertions(+) create mode 100644 docs/eks/gpu-monitoring.md diff --git a/docs/eks/gpu-monitoring.md b/docs/eks/gpu-monitoring.md new file mode 100644 index 00000000..8514654d --- /dev/null +++ b/docs/eks/gpu-monitoring.md @@ -0,0 +1,38 @@ +# Monitoring NVIDIA GPU Workloads + +GPUs play an integral part in data-intensive workloads. The eks-monitoring module of the Observability Accelerator provides the ability to deploy the NVIDIA DCGM Exporter Dashboard. +The dashboard utilizes metrics scraped from the `/metrics` endpoint that are exposed when running the NVIDIA GPU Operator with the [DCGM exporter](https://developer.nvidia.com/blog/monitoring-gpus-in-kubernetes-with-dcgm/) and the NVSMI binary. 
+ +!!!note + In order to make use of this dashboard, you will need to have a GPU-backed EKS cluster and deploy the [GPU operator](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/amazon-eks.html). + The recommended way of deploying the GPU operator is the [Data on EKS Blueprint](https://github.com/aws-ia/terraform-aws-eks-data-addons/blob/main/nvidia-gpu-operator.tf) + +## Deployment + +This is enabled by default in the [eks-monitoring module](https://aws-observability.github.io/terraform-aws-observability-accelerator/eks/). + +## Dashboards + +In order to start producing diagnostic metrics, you must first deploy the NVIDIA SMI binary. nvidia-smi (also NVSMI) provides monitoring and management capabilities for each of NVIDIA’s devices from Fermi and higher architecture families. We can now deploy the nvidia-smi binary, which shows diagnostic information about all GPUs visible to the container: + +``` +cat << EOF | kubectl apply -f - +apiVersion: v1 +kind: Pod +metadata: + name: nvidia-smi +spec: + restartPolicy: OnFailure + containers: + - name: nvidia-smi + image: "nvidia/cuda:11.0.3-base-ubuntu20.04" + args: + - "nvidia-smi" + resources: + limits: + nvidia.com/gpu: 1 +EOF +``` +Once the metrics are being produced, they should populate the DCGM exporter dashboard: + +![image](https://github.com/aws-observability/terraform-aws-observability-accelerator/assets/97046295/66e8ae83-3a78-48b8-a9fc-4460a5a4d173) diff --git a/mkdocs.yml b/mkdocs.yml index 918978a0..bf8594b8 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -28,6 +28,7 @@ nav: - Amazon EKS: - Infrastructure: eks/index.md - EKS API server: eks/eks-apiserver.md + - EKS GPU monitoring: eks/gpu-monitoring.md - Multicluster: - Single AWS account: eks/multicluster.md - Cross AWS account: eks/multiaccount.md diff --git a/modules/eks-monitoring/README.md b/modules/eks-monitoring/README.md index 5116546d..2899667e 100644 --- a/modules/eks-monitoring/README.md +++ b/modules/eks-monitoring/README.md @@ -61,6 +61,7 
@@ See examples using this Terraform modules in the **Amazon EKS** section of [this | [kubectl_manifest.flux_gitrepository](https://registry.terraform.io/providers/alekc/kubectl/latest/docs/resources/manifest) | resource | | [kubectl_manifest.flux_kustomization](https://registry.terraform.io/providers/alekc/kubectl/latest/docs/resources/manifest) | resource | | [kubectl_manifest.kubeproxy_monitoring_dashboard](https://registry.terraform.io/providers/alekc/kubectl/latest/docs/resources/manifest) | resource | +| [kubectl_manifest.nvidia_monitoring_dashboards](https://registry.terraform.io/providers/alekc/kubectl/latest/docs/resources/manifest) | resource | | [aws_caller_identity.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source | | [aws_eks_cluster.eks_cluster](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/eks_cluster) | data source | | [aws_partition.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/partition) | data source | @@ -93,6 +94,7 @@ See examples using this Terraform modules in the **Amazon EKS** section of [this | [enable\_managed\_prometheus](#input\_enable\_managed\_prometheus) | Creates a new Amazon Managed Service for Prometheus Workspace | `bool` | `true` | no | | [enable\_nginx](#input\_enable\_nginx) | Enable NGINX workloads monitoring, alerting and default dashboards | `bool` | `false` | no | | [enable\_node\_exporter](#input\_enable\_node\_exporter) | Enables or disables Node exporter. 
Disabling this might affect some data in the dashboards | `bool` | `true` | no | +| [enable\_nvidia\_monitoring](#input\_enable\_nvidia\_monitoring) | Enables monitoring of nvidia metrics | `bool` | `true` | no | | [enable\_recording\_rules](#input\_enable\_recording\_rules) | Enables or disables Managed Prometheus recording rules | `bool` | `true` | no | | [enable\_tracing](#input\_enable\_tracing) | Enables tracing with OTLP traces receiver to X-Ray | `bool` | `true` | no | | [flux\_config](#input\_flux\_config) | FluxCD configuration |
object({
create_namespace = bool
k8s_namespace = string
helm_chart_name = string
helm_chart_version = string
helm_release_name = string
helm_repo_url = string
helm_settings = map(string)
helm_values = map(any)
})
|
{
"create_namespace": true,
"helm_chart_name": "flux2",
"helm_chart_version": "2.12.2",
"helm_release_name": "observability-fluxcd-addon",
"helm_repo_url": "https://fluxcd-community.github.io/helm-charts",
"helm_settings": {},
"helm_values": {},
"k8s_namespace": "flux-system"
}
| no | @@ -127,6 +129,7 @@ See examples using this Terraform modules in the **Amazon EKS** section of [this | [managed\_prometheus\_workspace\_region](#input\_managed\_prometheus\_workspace\_region) | Amazon Managed Prometheus Workspace's Region | `string` | `null` | no | | [ne\_config](#input\_ne\_config) | Node exporter configuration |
object({
create_namespace = bool
k8s_namespace = string
helm_chart_name = string
helm_chart_version = string
helm_release_name = string
helm_repo_url = string
helm_settings = map(string)
helm_values = map(any)

scrape_interval = string
scrape_timeout = string
})
|
{
"create_namespace": true,
"helm_chart_name": "prometheus-node-exporter",
"helm_chart_version": "4.24.0",
"helm_release_name": "prometheus-node-exporter",
"helm_repo_url": "https://prometheus-community.github.io/helm-charts",
"helm_settings": {},
"helm_values": {},
"k8s_namespace": "prometheus-node-exporter",
"scrape_interval": "60s",
"scrape_timeout": "60s"
}
| no | | [nginx\_config](#input\_nginx\_config) | Configuration object for NGINX monitoring |
object({
enable_alerting_rules = bool
enable_recording_rules = bool
enable_dashboards = bool
scrape_sample_limit = number

flux_gitrepository_name = string
flux_gitrepository_url = string
flux_gitrepository_branch = string
flux_kustomization_name = string
flux_kustomization_path = string

grafana_dashboard_url = string

prometheus_metrics_endpoint = string
})
| `null` | no | +| [nvidia\_monitoring\_config](#input\_nvidia\_monitoring\_config) | Config object for nvidia monitoring |
object({
flux_gitrepository_name = string
flux_gitrepository_url = string
flux_gitrepository_branch = string
flux_kustomization_name = string
flux_kustomization_path = string
})
| `null` | no | | [prometheus\_config](#input\_prometheus\_config) | Controls default values such as scrape interval, timeouts and ports globally |
object({
global_scrape_interval = string
global_scrape_timeout = string
})
|
{
"global_scrape_interval": "120s",
"global_scrape_timeout": "15s"
}
| no | | [tags](#input\_tags) | Additional tags (e.g. `map('BusinessUnit`,`XYZ`) | `map(string)` | `{}` | no | | [target\_secret\_name](#input\_target\_secret\_name) | Target secret in Kubernetes to store the Grafana API Key Secret | `string` | `"grafana-admin-credentials"` | no | diff --git a/modules/eks-monitoring/dashboards.tf b/modules/eks-monitoring/dashboards.tf index 6ad13f33..d130e80d 100644 --- a/modules/eks-monitoring/dashboards.tf +++ b/modules/eks-monitoring/dashboards.tf @@ -95,6 +95,26 @@ YAML depends_on = [module.external_secrets] } +# nvidia dashboards +resource "kubectl_manifest" "nvidia_monitoring_dashboards" { + yaml_body = <