diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/Chart.yaml b/4.validation_and_observability/3.efa-node-exporter/EKS/Chart.yaml new file mode 100644 index 00000000..cbcd5cae --- /dev/null +++ b/4.validation_and_observability/3.efa-node-exporter/EKS/Chart.yaml @@ -0,0 +1,25 @@ +apiVersion: v2 +name: prometheus-node-exporter +description: A Helm chart for prometheus node-exporter +keywords: + - node-exporter + - prometheus + - exporter +type: application +version: 4.34.0 +appVersion: 1.8.0 +home: https://github.com/prometheus/node_exporter/ +sources: + - https://github.com/prometheus/node_exporter/ +maintainers: + - email: gianrubio@gmail.com + name: gianrubio + - email: zanhsieh@gmail.com + name: zanhsieh + - email: rootsandtrees@posteo.de + name: zeritti +annotations: + "artifacthub.io/license": Apache-2.0 + "artifacthub.io/links": | + - name: Chart Source + url: https://github.com/prometheus-community/helm-charts diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/README.md b/4.validation_and_observability/3.efa-node-exporter/EKS/README.md new file mode 100644 index 00000000..cd2b8557 --- /dev/null +++ b/4.validation_and_observability/3.efa-node-exporter/EKS/README.md @@ -0,0 +1,259 @@ +# EFA node Exporter for Prometheus on EKS + +Here we will show how to setup the EFA node Exporter for Prometheus on an Amazon EKS cluster with these [helm-charts](https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus-node-exporter). + +# 1. Environment variables + +Export these variables to setup your environment first: + +```bash +export AWS_REGION=us-west-2 +export ACCOUNT=$(aws sts get-caller-identity --query Account --output text) +export REGISTRY=${ACCOUNT}.dkr.ecr.${AWS_REGION}.amazonaws.com/ +export IMAGE=efa-node-exporter # Docker Image +export TAG=":1.0.0" # Do not specify tag as "latest" +export LOCAL_PORT=9000 # Local port to curl prometheus metrics +``` + +# 2. Build Docker Image + +To build the Docker image: + +```bash +git clone https://github.com/aws-samples/awsome-distributed-training.git +cd awsome-distributed-training/4.validation_and_observability/3.efa-node-exporter/ + +docker build -t ${REGISTRY}${IMAGE}${TAG} -f Dockerfile . +``` + +# 3. Push Docker Image to ECR + +Next, push the Docker image to ECR: + +```bash +echo "Logging in to $REGISTRY ..." +aws ecr get-login-password | docker login --username AWS --password-stdin $REGISTRY + +# Create registry if it does not exist +REGISTRY_COUNT=$(aws ecr describe-repositories | grep ${IMAGE} | wc -l) +if [ "$REGISTRY_COUNT" == "0" ]; then + echo "" + echo "Creating repository ${IMAGE} ..." + aws ecr create-repository --repository-name ${IMAGE} +fi + +# Push image +docker image push ${REGISTRY}${IMAGE}${TAG} +``` + +# 4. Add Helm repo + +Before we can install the helm chart, we need to add the repo like below: + +```bash +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts +helm repo update +``` + +# 5. Install Helm chart + +We have customized the [values.yaml](https://github.com/prometheus-community/helm-charts/blob/main/charts/prometheus-node-exporter/values.yaml) in `efa-exporter-values-temp.yaml`. Substitute environment variables to generate `efa-exporter-values.yaml` like below: + +```bash +envsubst < ./efa-exporter-values-temp.yaml > efa-exporter-values.yaml +``` +Next you can install the chart as below: + +```bash +helm install efa-node-exporter -f values.yaml prometheus-community/prometheus-node-exporter +``` +Once done you can see the chart as below: + +```bash +root@cb9511473ccc:/eks/deployment/distributed-training/pytorch/pytorchjob/efa-node-exporter/prometheus-node-exporter# helm list +NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION +efa-node-exporter default 1 2024-05-31 18:19:31.122892691 +0000 UTC deployed prometheus-node-exporter-4.34.0 1.8.0 +``` + +In addition, you will see efa-node-exporter pods starting up as well like below, one pod per node in the cluster: + +```bash +root@cb9511473ccc:/eks/deployment/distributed-training/pytorch/pytorchjob/efa-node-exporter/prometheus-node-exporter# k get pods | grep 'efa' +efa-node-exporter-prometheus-node-exporter-ctwcf 1/1 Running 0 4d10h +efa-node-exporter-prometheus-node-exporter-r6kvl 1/1 Running 0 4d10h +efa-node-exporter-prometheus-node-exporter-vh2zg 1/1 Running 0 4d10h +``` +Finally, you will also see a new service like below. + +```bash +root@cb9511473ccc:/eks/deployment/distributed-training/pytorch/pytorchjob/efa-node-exporter/prometheus-node-exporter# kubectl get service +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +efa-node-exporter-prometheus-node-exporter ClusterIP 10.100.243.108 9100/TCP 4d10h +``` +Note, the default port is 9100. If you wish to change it, you can do so in the following lines in `efa-exporter-values.yaml`: + +```bash +service: + enabled: true + type: ClusterIP + port: 9100 + targetPort: 9100 + nodePort: + portName: metrics +``` + +# 6. Port-forwarding + +Once the helm chart is installed, you can port forward as below. + +```bash +kubectl port-forward svc/efa-node-exporter-prometheus-node-exporter ${LOCAL_PORT}:9100 +``` + +# 7. Verify + +To verify, open another shell on the same node and try below to see the metrics + +```bash +curl http://127.0.0.1:${LOCAL_PORT}/metrics +``` + +You can grep 'efa' to see something like: + +```bash +root@cb9511473ccc:/eks# curl http://127.0.0.1:${LOCAL_PORT}/metrics | grep "efa" + % Total % Received % Xferd Average Speed Time Time Time Current + Dload Upload Total Spent Left Speed + 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0# HELP node_amazonefa_info Non-numeric data from /sys/class/infiniband/, value is always 1. +# TYPE node_amazonefa_info gauge +node_amazonefa_info{device="rdmap144s27"} 1 +node_amazonefa_info{device="rdmap160s27"} 1 +node_amazonefa_info{device="rdmap16s27"} 1 +node_amazonefa_info{device="rdmap32s27"} 1 +# HELP node_amazonefa_lifespan Lifespan of the port +# TYPE node_amazonefa_lifespan counter +node_amazonefa_lifespan{device="rdmap144s27",port="1"} 12 +node_amazonefa_lifespan{device="rdmap160s27",port="1"} 12 +node_amazonefa_lifespan{device="rdmap16s27",port="1"} 12 +node_amazonefa_lifespan{device="rdmap32s27",port="1"} 12 +# HELP node_amazonefa_rdma_read_bytes Number of bytes read with RDMA +# TYPE node_amazonefa_rdma_read_bytes counter +node_amazonefa_rdma_read_bytes{device="rdmap144s27",port="1"} 1.047241117296e+12 +node_amazonefa_rdma_read_bytes{device="rdmap160s27",port="1"} 1.04201975025e+12 +node_amazonefa_rdma_read_bytes{device="rdmap16s27",port="1"} 1.047241667482e+12 +node_amazonefa_rdma_read_bytes{device="rdmap32s27",port="1"} 1.047241117316e+12 +# HELP node_amazonefa_rdma_read_resp_bytes Number of read reponses bytes with RDMA +# TYPE node_amazonefa_rdma_read_resp_bytes counter +node_amazonefa_rdma_read_resp_bytes{device="rdmap144s27",port="1"} 1.04047386624e+12 +node_amazonefa_rdma_read_resp_bytes{device="rdmap160s27",port="1"} 1.035878016126e+12 +node_amazonefa_rdma_read_resp_bytes{device="rdmap16s27",port="1"} 1.038950461928e+12 +node_amazonefa_rdma_read_resp_bytes{device="rdmap32s27",port="1"} 1.04260614144e+12 +# HELP node_amazonefa_rdma_read_wr_err Number of read write errors with RDMA +# TYPE node_amazonefa_rdma_read_wr_err counter +node_amazonefa_rdma_read_wr_err{device="rdmap144s27",port="1"} 0 +node_amazonefa_rdma_read_wr_err{device="rdmap160s27",port="1"} 0 +node_amazonefa_rdma_read_wr_err{device="rdmap16s27",port="1"} 0 +node_amazonefa_rdma_read_wr_err{device="rdmap32s27",port="1"} 0 +# HELP node_amazonefa_rdma_read_wrs Number of read rs with RDMA +# TYPE node_amazonefa_rdma_read_wrs counter +node_amazonefa_rdma_read_wrs{device="rdmap144s27",port="1"} 3.416044e+06 +node_amazonefa_rdma_read_wrs{device="rdmap160s27",port="1"} 3.238688e+06 +node_amazonefa_rdma_read_wrs{device="rdmap16s27",port="1"} 3.338488e+06 +node_amazonefa_rdma_read_wrs{device="rdmap32s27",port="1"} 3.464577e+06 +# HELP node_amazonefa_rdma_write_bytes Number of bytes wrote with RDMA +# TYPE node_amazonefa_rdma_write_bytes counter +node_amazonefa_rdma_write_bytes{device="rdmap144s27",port="1"} 0 +node_amazonefa_rdma_write_bytes{device="rdmap160s27",port="1"} 0 +node_amazonefa_rdma_write_bytes{device="rdmap16s27",port="1"} 0 +node_amazonefa_rdma_write_bytes{device="rdmap32s27",port="1"} 0 +# HELP node_amazonefa_rdma_write_recv_bytes Number of bytes wrote and received with RDMA +# TYPE node_amazonefa_rdma_write_recv_bytes counter +node_amazonefa_rdma_write_recv_bytes{device="rdmap144s27",port="1"} 0 +node_amazonefa_rdma_write_recv_bytes{device="rdmap160s27",port="1"} 0 +node_amazonefa_rdma_write_recv_bytes{device="rdmap16s27",port="1"} 0 +node_amazonefa_rdma_write_recv_bytes{device="rdmap32s27",port="1"} 0 +# HELP node_amazonefa_rdma_write_wr_err Number of bytes wrote wr with error RDMA +# TYPE node_amazonefa_rdma_write_wr_err counter +node_amazonefa_rdma_write_wr_err{device="rdmap144s27",port="1"} 0 +node_amazonefa_rdma_write_wr_err{device="rdmap160s27",port="1"} 0 +node_amazonefa_rdma_write_wr_err{device="rdmap16s27",port="1"} 0 +node_amazonefa_rdma_write_wr_err{device="rdmap32s27",port="1"} 0 +# HELP node_amazonefa_rdma_write_wrs Number of bytes wrote wrs RDMA +# TYPE node_amazonefa_rdma_write_wrs counter +node_amazonefa_rdma_write_wrs{device="rdmap144s27",port="1"} 0 +node_amazonefa_rdma_write_wrs{device="rdmap160s27",port="1"} 0 +node_amazonefa_rdma_write_wrs{device="rdmap16s27",port="1"} 0 +node_amazonefa_rdma_write_wrs{device="rdmap32s27",port="1"} 0 +# HELP node_amazonefa_recv_bytes Number of bytes recv bytes +# TYPE node_amazonefa_recv_bytes counter +node_amazonefa_recv_bytes{device="rdmap144s27",port="1"} 6.858286312e+09 +node_amazonefa_recv_bytes{device="rdmap160s27",port="1"} 5.331667316e+09 +node_amazonefa_recv_bytes{device="rdmap16s27",port="1"} 6.187744962e+09 +node_amazonefa_recv_bytes{device="rdmap32s27",port="1"} 7.275998544e+09 +# HELP node_amazonefa_recv_wrs Number of bytes recv wrs +# TYPE node_amazonefa_recv_wrs counter +node_amazonefa_recv_wrs{device="rdmap144s27",port="1"} 3.394439e+06 +node_amazonefa_recv_wrs{device="rdmap160s27",port="1"} 3.222012e+06 +node_amazonefa_recv_wrs{device="rdmap16s27",port="1"} 3.319097e+06 +node_amazonefa_recv_wrs{device="rdmap32s27",port="1"} 3.441609e+06 +# HELP node_amazonefa_rx_bytes Number of bytes received +# TYPE node_amazonefa_rx_bytes counter +node_amazonefa_rx_bytes{device="rdmap144s27",port="1"} 1.054099403608e+12 +node_amazonefa_rx_bytes{device="rdmap160s27",port="1"} 1.047351417566e+12 +node_amazonefa_rx_bytes{device="rdmap16s27",port="1"} 1.053429412444e+12 +node_amazonefa_rx_bytes{device="rdmap32s27",port="1"} 1.05451711586e+12 +# HELP node_amazonefa_rx_drops Number of packets droped +# TYPE node_amazonefa_rx_drops counter +node_amazonefa_rx_drops{device="rdmap144s27",port="1"} 0 +node_amazonefa_rx_drops{device="rdmap160s27",port="1"} 0 +node_amazonefa_rx_drops{device="rdmap16s27",port="1"} 0 +node_amazonefa_rx_drops{device="rdmap32s27",port="1"} 0 +# HELP node_amazonefa_rx_pkts Number of packets received +# TYPE node_amazonefa_rx_pkts counter +node_amazonefa_rx_pkts{device="rdmap144s27",port="1"} 6.810483e+06 +node_amazonefa_rx_pkts{device="rdmap160s27",port="1"} 6.4607e+06 +node_amazonefa_rx_pkts{device="rdmap16s27",port="1"} 6.657585e+06 +node_amazonefa_rx_pkts{device="rdmap32s27",port="1"} 6.906186e+06 +# HELP node_amazonefa_send_bytes Number of bytes send +# TYPE node_amazonefa_send_bytes counter +node_amazonefa_send_bytes{device="rdmap144s27",port="1"} 6.92065338e+09 +node_amazonefa_send_bytes{device="rdmap160s27",port="1"} 6.290013412e+09 +node_amazonefa_send_bytes{device="rdmap16s27",port="1"} 8.447687166e+09 +node_amazonefa_send_bytes{device="rdmap32s27",port="1"} 4.77018732e+09 +# HELP node_amazonefa_send_wrs Number of wrs send +# TYPE node_amazonefa_send_wrs counter +node_amazonefa_send_wrs{device="rdmap144s27",port="1"} 3.401962e+06 +node_amazonefa_send_wrs{device="rdmap160s27",port="1"} 3.331132e+06 +node_amazonefa_send_wrs{device="rdmap16s27",port="1"} 3.577494e+06 +node_amazonefa_send_wrs{device="rdmap32s27",port="1"} 3.161853e+06 +# HELP node_amazonefa_tx_bytes Number of bytes transmitted +# TYPE node_amazonefa_tx_bytes counter +node_amazonefa_tx_bytes{device="rdmap144s27",port="1"} 1.04739451962e+12 +node_amazonefa_tx_bytes{device="rdmap160s27",port="1"} 1.042168029538e+12 +node_amazonefa_tx_bytes{device="rdmap16s27",port="1"} 1.047398149094e+12 +node_amazonefa_tx_bytes{device="rdmap32s27",port="1"} 1.04737632876e+12 +# HELP node_amazonefa_tx_pkts Number of packets transmitted +# TYPE node_amazonefa_tx_pkts counter +node_amazonefa_tx_pkts{device="rdmap144s27",port="1"} 1.30916726e+08 +node_amazonefa_tx_pkts{device="rdmap160s27",port="1"} 1.30262069e+08 +node_amazonefa_tx_pkts{device="rdmap16s27",port="1"} 1.30907467e+08 +node_amazonefa_tx_pkts{device="rdmap32s27",port="1"} 1.30933425e+08 +node_scrape_collector_duration_seconds{collector="amazonefa"} 0.016049024 +node_scrape_collector_success{collector="amazonefa"} 1 +100 206k 0 206k 0 0 1266k 0 +``` + +Note, these metrics are counters and when you run an application look for these counters to increase. If for some reason, they are constant, that indicates messages are not sent over EFA. + +# 8. Uninstall Exporter + +To uninstall the exporter, you can do the following which would also stop the relevant pods and service it created + +```bash +helm uninstall efa-node-exporter +``` +Finally, to free up the ${LOCAL_PORT}, you can find the process from below and kill the process to free the port: + +```bash +ps -Aef | grep 'port' +``` diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/ci/port-values.yaml b/4.validation_and_observability/3.efa-node-exporter/EKS/ci/port-values.yaml new file mode 100644 index 00000000..dbfb4b67 --- /dev/null +++ b/4.validation_and_observability/3.efa-node-exporter/EKS/ci/port-values.yaml @@ -0,0 +1,3 @@ +service: + targetPort: 9102 + port: 9102 diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/efa-exporter-values-temp.yaml b/4.validation_and_observability/3.efa-node-exporter/EKS/efa-exporter-values-temp.yaml new file mode 100644 index 00000000..34e89822 --- /dev/null +++ b/4.validation_and_observability/3.efa-node-exporter/EKS/efa-exporter-values-temp.yaml @@ -0,0 +1,532 @@ +# Default values for prometheus-node-exporter. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. +image: + registry: ${REPOSITORY} + repository: ${IMAGE} + # Overrides the image tag whose default is {{ printf "v%s" .Chart.AppVersion }} + tag: ${TAG} + pullPolicy: IfNotPresent + digest: "" + +imagePullSecrets: [] +# - name: "image-pull-secret" +nameOverride: "" +fullnameOverride: "" + +# Number of old history to retain to allow rollback +# Default Kubernetes value is set to 10 +revisionHistoryLimit: 10 + +global: + # To help compatibility with other charts which use global.imagePullSecrets. + # Allow either an array of {name: pullSecret} maps (k8s-style), or an array of strings (more common helm-style). + # global: + # imagePullSecrets: + # - name: pullSecret1 + # - name: pullSecret2 + # or + # global: + # imagePullSecrets: + # - pullSecret1 + # - pullSecret2 + imagePullSecrets: [] + # + # Allow parent charts to override registry hostname + imageRegistry: "" + +# Configure kube-rbac-proxy. When enabled, creates a kube-rbac-proxy to protect the node-exporter http endpoint. +# The requests are served through the same service but requests are HTTPS. +kubeRBACProxy: + enabled: false + ## Set environment variables as name/value pairs + env: {} + # VARIABLE: value + image: + registry: quay.io + repository: brancz/kube-rbac-proxy + tag: v0.16.0 + sha: "" + pullPolicy: IfNotPresent + + # List of additional cli arguments to configure kube-rbac-proxy + # for example: --tls-cipher-suites, --log-file, etc. + # all the possible args can be found here: https://github.com/brancz/kube-rbac-proxy#usage + extraArgs: [] + + ## Specify security settings for a Container + ## Allows overrides and additional options compared to (Pod) securityContext + ## Ref: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-the-security-context-for-a-container + containerSecurityContext: {} + + # Specify the port used for the Node exporter container (upstream port) + port: 8100 + # Specify the name of the container port + portName: http + # Configure a hostPort. If true, hostPort will be enabled in the container and set to service.port. + enableHostPort: false + + # Configure Proxy Endpoints Port + # This is the port being probed for readiness + proxyEndpointsPort: 8888 + # Configure a hostPort. If true, hostPort will be enabled in the container and set to proxyEndpointsPort. + enableProxyEndpointsHostPort: false + + resources: {} + # We usually recommend not to specify default resources and to leave this as a conscious + # choice for the user. This also increases chances charts run on environments with little + # resources, such as Minikube. If you do want to specify resources, uncomment the following + # lines, adjust them as necessary, and remove the curly braces after 'resources:'. + # limits: + # cpu: 100m + # memory: 64Mi + # requests: + # cpu: 10m + # memory: 32Mi + +service: + enabled: true + type: ClusterIP + port: 9100 + targetPort: 9100 + nodePort: + portName: metrics + listenOnAllInterfaces: true + annotations: + prometheus.io/scrape: "true" + ipDualStack: + enabled: false + ipFamilies: ["IPv6", "IPv4"] + ipFamilyPolicy: "PreferDualStack" + externalTrafficPolicy: "" + +# Set a NetworkPolicy with: +# ingress only on service.port +# no egress permitted +networkPolicy: + enabled: false + +# Additional environment variables that will be passed to the daemonset +env: {} +## env: +## VARIABLE: value + +prometheus: + monitor: + enabled: false + additionalLabels: {} + namespace: "" + + jobLabel: "" + + # List of pod labels to add to node exporter metrics + # https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api.md#servicemonitor + podTargetLabels: [] + + scheme: http + basicAuth: {} + bearerTokenFile: + tlsConfig: {} + + ## proxyUrl: URL of a proxy that should be used for scraping. + ## + proxyUrl: "" + + ## Override serviceMonitor selector + ## + selectorOverride: {} + + ## Attach node metadata to discovered targets. Requires Prometheus v2.35.0 and above. + ## + attachMetadata: + node: false + + relabelings: [] + metricRelabelings: [] + interval: "" + scrapeTimeout: 10s + ## prometheus.monitor.apiVersion ApiVersion for the serviceMonitor Resource(defaults to "monitoring.coreos.com/v1") + apiVersion: "" + + ## SampleLimit defines per-scrape limit on number of scraped samples that will be accepted. + ## + sampleLimit: 0 + + ## TargetLimit defines a limit on the number of scraped targets that will be accepted. + ## + targetLimit: 0 + + ## Per-scrape limit on number of labels that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + ## + labelLimit: 0 + + ## Per-scrape limit on length of labels name that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + ## + labelNameLengthLimit: 0 + + ## Per-scrape limit on length of labels value that will be accepted for a sample. Only valid in Prometheus versions 2.27.0 and newer. + ## + labelValueLengthLimit: 0 + + # PodMonitor defines monitoring for a set of pods. + # ref. https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api.md#monitoring.coreos.com/v1.PodMonitor + # Using a PodMonitor may be preferred in some environments where there is very large number + # of Node Exporter endpoints (1000+) behind a single service. + # The PodMonitor is disabled by default. When switching from ServiceMonitor to PodMonitor, + # the time series resulting from the configuration through PodMonitor may have different labels. + # For instance, there will not be the service label any longer which might + # affect PromQL queries selecting that label. + podMonitor: + enabled: false + # Namespace in which to deploy the pod monitor. Defaults to the release namespace. + namespace: "" + # Additional labels, e.g. setting a label for pod monitor selector as set in prometheus + additionalLabels: {} + # release: kube-prometheus-stack + # PodTargetLabels transfers labels of the Kubernetes Pod onto the target. + podTargetLabels: [] + # apiVersion defaults to monitoring.coreos.com/v1. + apiVersion: "" + # Override pod selector to select pod objects. + selectorOverride: {} + # Attach node metadata to discovered targets. Requires Prometheus v2.35.0 and above. + attachMetadata: + node: false + # The label to use to retrieve the job name from. Defaults to label app.kubernetes.io/name. + jobLabel: "" + + # Scheme/protocol to use for scraping. + scheme: "http" + # Path to scrape metrics at. + path: "/metrics" + + # BasicAuth allow an endpoint to authenticate over basic authentication. + # More info: https://prometheus.io/docs/operating/configuration/#endpoint + basicAuth: {} + # Secret to mount to read bearer token for scraping targets. + # The secret needs to be in the same namespace as the pod monitor and accessible by the Prometheus Operator. + # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#secretkeyselector-v1-core + bearerTokenSecret: {} + # TLS configuration to use when scraping the endpoint. + tlsConfig: {} + # Authorization section for this endpoint. + # https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api.md#monitoring.coreos.com/v1.SafeAuthorization + authorization: {} + # OAuth2 for the URL. Only valid in Prometheus versions 2.27.0 and newer. + # https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api.md#monitoring.coreos.com/v1.OAuth2 + oauth2: {} + + # ProxyURL eg http://proxyserver:2195. Directs scrapes through proxy to this endpoint. + proxyUrl: "" + # Interval at which endpoints should be scraped. If not specified Prometheus’ global scrape interval is used. + interval: "" + # Timeout after which the scrape is ended. If not specified, the Prometheus global scrape interval is used. + scrapeTimeout: "" + # HonorTimestamps controls whether Prometheus respects the timestamps present in scraped data. + honorTimestamps: true + # HonorLabels chooses the metric’s labels on collisions with target labels. + honorLabels: true + # Whether to enable HTTP2. Default false. + enableHttp2: "" + # Drop pods that are not running. (Failed, Succeeded). + # Enabled by default. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase + filterRunning: "" + # FollowRedirects configures whether scrape requests follow HTTP 3xx redirects. Default false. + followRedirects: "" + # Optional HTTP URL parameters + params: {} + + # RelabelConfigs to apply to samples before scraping. Prometheus Operator automatically adds + # relabelings for a few standard Kubernetes fields. The original scrape job’s name + # is available via the __tmp_prometheus_job_name label. + # More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config + relabelings: [] + # MetricRelabelConfigs to apply to samples before ingestion. + metricRelabelings: [] + + # SampleLimit defines per-scrape limit on number of scraped samples that will be accepted. + sampleLimit: 0 + # TargetLimit defines a limit on the number of scraped targets that will be accepted. + targetLimit: 0 + # Per-scrape limit on number of labels that will be accepted for a sample. + # Only valid in Prometheus versions 2.27.0 and newer. + labelLimit: 0 + # Per-scrape limit on length of labels name that will be accepted for a sample. + # Only valid in Prometheus versions 2.27.0 and newer. + labelNameLengthLimit: 0 + # Per-scrape limit on length of labels value that will be accepted for a sample. + # Only valid in Prometheus versions 2.27.0 and newer. + labelValueLengthLimit: 0 + +## Customize the updateStrategy if set +updateStrategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 1 + +resources: {} + # We usually recommend not to specify default resources and to leave this as a conscious + # choice for the user. This also increases chances charts run on environments with little + # resources, such as Minikube. If you do want to specify resources, uncomment the following + # lines, adjust them as necessary, and remove the curly braces after 'resources:'. + # limits: + # cpu: 200m + # memory: 50Mi + # requests: + # cpu: 100m + # memory: 30Mi + +# Specify the container restart policy passed to the Node Export container +# Possible Values: Always (default)|OnFailure|Never +restartPolicy: null + +serviceAccount: + # Specifies whether a ServiceAccount should be created + create: true + # The name of the ServiceAccount to use. + # If not set and create is true, a name is generated using the fullname template + name: + annotations: {} + imagePullSecrets: [] + automountServiceAccountToken: false + +securityContext: + fsGroup: 65534 + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + +containerSecurityContext: + readOnlyRootFilesystem: true + # capabilities: + # add: + # - SYS_TIME + +rbac: + ## If true, create & use RBAC resources + ## + create: true + ## If true, create & use Pod Security Policy resources + ## https://kubernetes.io/docs/concepts/policy/pod-security-policy/ + pspEnabled: true + pspAnnotations: {} + +# for deployments that have node_exporter deployed outside of the cluster, list +# their addresses here +endpoints: [] + +# Expose the service to the host network +hostNetwork: true + +# Share the host process ID namespace +hostPID: true + +# Mount the node's root file system (/) at /host/root in the container +hostRootFsMount: + enabled: true + # Defines how new mounts in existing mounts on the node or in the container + # are propagated to the container or node, respectively. Possible values are + # None, HostToContainer, and Bidirectional. If this field is omitted, then + # None is used. More information on: + # https://kubernetes.io/docs/concepts/storage/volumes/#mount-propagation + mountPropagation: HostToContainer + +# Mount the node's proc file system (/proc) at /host/proc in the container +hostProcFsMount: + # Possible values are None, HostToContainer, and Bidirectional + mountPropagation: "" + +# Mount the node's sys file system (/sys) at /host/sys in the container +hostSysFsMount: + # Possible values are None, HostToContainer, and Bidirectional + mountPropagation: "" + +## Assign a group of affinity scheduling rules +## +affinity: {} +# nodeAffinity: +# requiredDuringSchedulingIgnoredDuringExecution: +# nodeSelectorTerms: +# - matchFields: +# - key: metadata.name +# operator: In +# values: +# - target-host-name + +# Annotations to be added to node exporter pods +podAnnotations: + # Fix for very slow GKE cluster upgrades + cluster-autoscaler.kubernetes.io/safe-to-evict: "true" + +# Extra labels to be added to node exporter pods +podLabels: {} + +# Annotations to be added to node exporter daemonset +daemonsetAnnotations: {} + +## set to true to add the release label so scraping of the servicemonitor with kube-prometheus-stack works out of the box +releaseLabel: false + +# Custom DNS configuration to be added to prometheus-node-exporter pods +dnsConfig: {} +# nameservers: +# - 1.2.3.4 +# searches: +# - ns1.svc.cluster-domain.example +# - my.dns.search.suffix +# options: +# - name: ndots +# value: "2" +# - name: edns0 + +## Assign a nodeSelector if operating a hybrid cluster +## +nodeSelector: + kubernetes.io/os: linux + # kubernetes.io/arch: amd64 + +# Specify grace period for graceful termination of pods. Defaults to 30 if null or not specified +terminationGracePeriodSeconds: null + +tolerations: + - effect: NoSchedule + operator: Exists + +# Enable or disable container termination message settings +# https://kubernetes.io/docs/tasks/debug/debug-application/determine-reason-pod-failure/ +terminationMessageParams: + enabled: false + # If enabled, specify the path for termination messages + terminationMessagePath: /dev/termination-log + # If enabled, specify the policy for termination messages + terminationMessagePolicy: File + + +## Assign a PriorityClassName to pods if set +# priorityClassName: "" + +## Additional container arguments +## +extraArgs: [] +# - --collector.diskstats.ignored-devices=^(ram|loop|fd|(h|s|v)d[a-z]|nvme\\d+n\\d+p)\\d+$ +# - --collector.textfile.directory=/run/prometheus + +## Additional mounts from the host to node-exporter container +## +extraHostVolumeMounts: [] +# - name: +# hostPath: +# https://kubernetes.io/docs/concepts/storage/volumes/#hostpath-volume-types +# type: "" (Default)|DirectoryOrCreate|Directory|FileOrCreate|File|Socket|CharDevice|BlockDevice +# mountPath: +# readOnly: true|false +# mountPropagation: None|HostToContainer|Bidirectional + +## Additional configmaps to be mounted. +## +configmaps: [] +# - name: +# mountPath: +secrets: [] +# - name: +# mountPath: +## Override the deployment namespace +## +namespaceOverride: "" + +## Additional containers for export metrics to text file; fields image,imagePullPolicy,securityContext take default value from main container +## +sidecars: [] +# - name: nvidia-dcgm-exporter +# image: nvidia/dcgm-exporter:1.4.3 +# volumeMounts: +# - name: tmp +# mountPath: /tmp + +## Volume for sidecar containers +## +sidecarVolumeMount: [] +# - name: collector-textfiles +# mountPath: /run/prometheus +# readOnly: false + +## Additional mounts from the host to sidecar containers +## +sidecarHostVolumeMounts: [] +# - name: +# hostPath: +# mountPath: +# readOnly: true|false +# mountPropagation: None|HostToContainer|Bidirectional + +## Additional InitContainers to initialize the pod +## +extraInitContainers: [] + +## Liveness probe +## +livenessProbe: + failureThreshold: 3 + httpGet: + httpHeaders: [] + scheme: http + initialDelaySeconds: 0 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 1 + +## Readiness probe +## +readinessProbe: + failureThreshold: 3 + httpGet: + httpHeaders: [] + scheme: http + initialDelaySeconds: 0 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 1 + +# Enable vertical pod autoscaler support for prometheus-node-exporter +verticalPodAutoscaler: + enabled: false + + # Recommender responsible for generating recommendation for the object. + # List should be empty (then the default recommender will generate the recommendation) + # or contain exactly one recommender. + # recommenders: + # - name: custom-recommender-performance + + # List of resources that the vertical pod autoscaler can control. Defaults to cpu and memory + controlledResources: [] + # Specifies which resource values should be controlled: RequestsOnly or RequestsAndLimits. + # controlledValues: RequestsAndLimits + + # Define the max allowed resources for the pod + maxAllowed: {} + # cpu: 200m + # memory: 100Mi + # Define the min allowed resources for the pod + minAllowed: {} + # cpu: 200m + # memory: 100Mi + + # updatePolicy: + # Specifies minimal number of replicas which need to be alive for VPA Updater to attempt pod eviction + # minReplicas: 1 + # Specifies whether recommended updates are applied when a Pod is started and whether recommended updates + # are applied during the life of a Pod. Possible values are "Off", "Initial", "Recreate", and "Auto". + # updateMode: Auto + +# Extra manifests to deploy as an array +extraManifests: [] + # - | + # apiVersion: v1 + # kind: ConfigMap + # metadata: + # name: prometheus-extra + # data: + # extra-data: "value" + +# Override version of app, required if image.tag is defined and does not follow semver +version: "" diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/templates/NOTES.txt b/4.validation_and_observability/3.efa-node-exporter/EKS/templates/NOTES.txt new file mode 100644 index 00000000..db8584de --- /dev/null +++ b/4.validation_and_observability/3.efa-node-exporter/EKS/templates/NOTES.txt @@ -0,0 +1,29 @@ +1. Get the application URL by running these commands: +{{- if contains "NodePort" .Values.service.type }} + export NODE_PORT=$(kubectl get --namespace {{ template "prometheus-node-exporter.namespace" . }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ template "prometheus-node-exporter.fullname" . }}) + export NODE_IP=$(kubectl get nodes --namespace {{ template "prometheus-node-exporter.namespace" . }} -o jsonpath="{.items[0].status.addresses[0].address}") + echo http://$NODE_IP:$NODE_PORT +{{- else if contains "LoadBalancer" .Values.service.type }} + NOTE: It may take a few minutes for the LoadBalancer IP to be available. + You can watch the status of by running 'kubectl get svc -w {{ template "prometheus-node-exporter.fullname" . }}' + export SERVICE_IP=$(kubectl get svc --namespace {{ template "prometheus-node-exporter.namespace" . }} {{ template "prometheus-node-exporter.fullname" . }} -o jsonpath='{.status.loadBalancer.ingress[0].ip}') + echo http://$SERVICE_IP:{{ .Values.service.port }} +{{- else if contains "ClusterIP" .Values.service.type }} + export POD_NAME=$(kubectl get pods --namespace {{ template "prometheus-node-exporter.namespace" . }} -l "app.kubernetes.io/name={{ template "prometheus-node-exporter.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}") + echo "Visit http://127.0.0.1:9100 to use your application" + kubectl port-forward --namespace {{ template "prometheus-node-exporter.namespace" . }} $POD_NAME 9100 +{{- end }} + +{{- if .Values.kubeRBACProxy.enabled}} + +kube-rbac-proxy endpoint protections is enabled: +- Metrics endpoints is now HTTPS +- Ensure that the client authenticates the requests (e.g. via service account) with the following role permissions: +``` +rules: + - apiGroups: [ "" ] + resources: ["services/{{ template "prometheus-node-exporter.fullname" . }}"] + verbs: + - get +``` +{{- end }} \ No newline at end of file diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/templates/_helpers.tpl b/4.validation_and_observability/3.efa-node-exporter/EKS/templates/_helpers.tpl new file mode 100644 index 00000000..8e84832c --- /dev/null +++ b/4.validation_and_observability/3.efa-node-exporter/EKS/templates/_helpers.tpl @@ -0,0 +1,202 @@ +{{/* vim: set filetype=mustache: */}} +{{/* +Expand the name of the chart. +*/}} +{{- define "prometheus-node-exporter.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "prometheus-node-exporter.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "prometheus-node-exporter.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "prometheus-node-exporter.labels" -}} +helm.sh/chart: {{ include "prometheus-node-exporter.chart" . }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +app.kubernetes.io/component: metrics +app.kubernetes.io/part-of: {{ include "prometheus-node-exporter.name" . }} +{{ include "prometheus-node-exporter.selectorLabels" . }} +{{- with .Chart.AppVersion }} +app.kubernetes.io/version: {{ . | quote }} +{{- end }} +{{- with .Values.podLabels }} +{{ toYaml . }} +{{- end }} +{{- if .Values.releaseLabel }} +release: {{ .Release.Name }} +{{- end }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "prometheus-node-exporter.selectorLabels" -}} +app.kubernetes.io/name: {{ include "prometheus-node-exporter.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + + +{{/* +Create the name of the service account to use +*/}} +{{- define "prometheus-node-exporter.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "prometheus-node-exporter.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} + +{{/* +The image to use +*/}} +{{- define "prometheus-node-exporter.image" -}} +{{- if .Values.image.sha }} +{{- fail "image.sha forbidden. Use image.digest instead" }} +{{- else if .Values.image.digest }} +{{- if .Values.global.imageRegistry }} +{{- printf "%s/%s:%s@%s" .Values.global.imageRegistry .Values.image.repository (default (printf "v%s" .Chart.AppVersion) .Values.image.tag) .Values.image.digest }} +{{- else }} +{{- printf "%s/%s:%s@%s" .Values.image.registry .Values.image.repository (default (printf "v%s" .Chart.AppVersion) .Values.image.tag) .Values.image.digest }} +{{- end }} +{{- else }} +{{- if .Values.global.imageRegistry }} +{{- printf "%s/%s:%s" .Values.global.imageRegistry .Values.image.repository (default (printf "v%s" .Chart.AppVersion) .Values.image.tag) }} +{{- else }} +{{- printf "%s/%s:%s" .Values.image.registry .Values.image.repository (default (printf "v%s" .Chart.AppVersion) .Values.image.tag) }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Allow the release namespace to be overridden for multi-namespace deployments in combined charts +*/}} +{{- define "prometheus-node-exporter.namespace" -}} +{{- if .Values.namespaceOverride }} +{{- .Values.namespaceOverride }} +{{- else }} +{{- .Release.Namespace }} +{{- end }} +{{- end }} + +{{/* +Create the namespace name of the service monitor +*/}} +{{- define "prometheus-node-exporter.monitor-namespace" -}} +{{- if .Values.namespaceOverride }} +{{- .Values.namespaceOverride }} +{{- else }} +{{- if .Values.prometheus.monitor.namespace }} +{{- .Values.prometheus.monitor.namespace }} +{{- else }} +{{- .Release.Namespace }} +{{- end }} +{{- end }} +{{- end }} + +{{/* Sets default scrape limits for servicemonitor */}} +{{- define "servicemonitor.scrapeLimits" -}} +{{- with .sampleLimit }} +sampleLimit: {{ . }} +{{- end }} +{{- with .targetLimit }} +targetLimit: {{ . }} +{{- end }} +{{- with .labelLimit }} +labelLimit: {{ . }} +{{- end }} +{{- with .labelNameLengthLimit }} +labelNameLengthLimit: {{ . }} +{{- end }} +{{- with .labelValueLengthLimit }} +labelValueLengthLimit: {{ . }} +{{- end }} +{{- end }} + +{{/* +Formats imagePullSecrets. Input is (dict "Values" .Values "imagePullSecrets" .{specific imagePullSecrets}) +*/}} +{{- define "prometheus-node-exporter.imagePullSecrets" -}} +{{- range (concat .Values.global.imagePullSecrets .imagePullSecrets) }} + {{- if eq (typeOf .) "map[string]interface {}" }} +- {{ toYaml . | trim }} + {{- else }} +- name: {{ . }} + {{- end }} +{{- end }} +{{- end -}} + +{{/* +Create the namespace name of the pod monitor +*/}} +{{- define "prometheus-node-exporter.podmonitor-namespace" -}} +{{- if .Values.namespaceOverride }} +{{- .Values.namespaceOverride }} +{{- else }} +{{- if .Values.prometheus.podMonitor.namespace }} +{{- .Values.prometheus.podMonitor.namespace }} +{{- else }} +{{- .Release.Namespace }} +{{- end }} +{{- end }} +{{- end }} + +{{/* Sets default scrape limits for podmonitor */}} +{{- define "podmonitor.scrapeLimits" -}} +{{- with .sampleLimit }} +sampleLimit: {{ . }} +{{- end }} +{{- with .targetLimit }} +targetLimit: {{ . }} +{{- end }} +{{- with .labelLimit }} +labelLimit: {{ . }} +{{- end }} +{{- with .labelNameLengthLimit }} +labelNameLengthLimit: {{ . }} +{{- end }} +{{- with .labelValueLengthLimit }} +labelValueLengthLimit: {{ . }} +{{- end }} +{{- end }} + +{{/* Sets sidecar volumeMounts */}} +{{- define "prometheus-node-exporter.sidecarVolumeMounts" -}} +{{- range $_, $mount := $.Values.sidecarVolumeMount }} +- name: {{ $mount.name }} + mountPath: {{ $mount.mountPath }} + readOnly: {{ $mount.readOnly }} +{{- end }} +{{- range $_, $mount := $.Values.sidecarHostVolumeMounts }} +- name: {{ $mount.name }} + mountPath: {{ $mount.mountPath }} + readOnly: {{ $mount.readOnly }} +{{- if $mount.mountPropagation }} + mountPropagation: {{ $mount.mountPropagation }} +{{- end }} +{{- end }} +{{- end }} diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/templates/clusterrole.yaml b/4.validation_and_observability/3.efa-node-exporter/EKS/templates/clusterrole.yaml new file mode 100644 index 00000000..c256dba7 --- /dev/null +++ b/4.validation_and_observability/3.efa-node-exporter/EKS/templates/clusterrole.yaml @@ -0,0 +1,19 @@ +{{- if and (eq .Values.rbac.create true) (eq .Values.kubeRBACProxy.enabled true) -}} +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "prometheus-node-exporter.fullname" . }} + labels: + {{- include "prometheus-node-exporter.labels" . | nindent 4 }} +rules: + {{- if $.Values.kubeRBACProxy.enabled }} + - apiGroups: [ "authentication.k8s.io" ] + resources: + - tokenreviews + verbs: [ "create" ] + - apiGroups: [ "authorization.k8s.io" ] + resources: + - subjectaccessreviews + verbs: [ "create" ] + {{- end }} +{{- end -}} diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/templates/clusterrolebinding.yaml b/4.validation_and_observability/3.efa-node-exporter/EKS/templates/clusterrolebinding.yaml new file mode 100644 index 00000000..653305ad --- /dev/null +++ b/4.validation_and_observability/3.efa-node-exporter/EKS/templates/clusterrolebinding.yaml @@ -0,0 +1,20 @@ +{{- if and (eq .Values.rbac.create true) (eq .Values.kubeRBACProxy.enabled true) -}} +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + {{- include "prometheus-node-exporter.labels" . | nindent 4 }} + name: {{ template "prometheus-node-exporter.fullname" . }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole +{{- if .Values.rbac.useExistingRole }} + name: {{ .Values.rbac.useExistingRole }} +{{- else }} + name: {{ template "prometheus-node-exporter.fullname" . }} +{{- end }} +subjects: +- kind: ServiceAccount + name: {{ template "prometheus-node-exporter.serviceAccountName" . }} + namespace: {{ template "prometheus-node-exporter.namespace" . }} +{{- end -}} diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/templates/daemonset.yaml b/4.validation_and_observability/3.efa-node-exporter/EKS/templates/daemonset.yaml new file mode 100644 index 00000000..23896a23 --- /dev/null +++ b/4.validation_and_observability/3.efa-node-exporter/EKS/templates/daemonset.yaml @@ -0,0 +1,311 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: {{ include "prometheus-node-exporter.fullname" . }} + namespace: {{ include "prometheus-node-exporter.namespace" . }} + labels: + {{- include "prometheus-node-exporter.labels" . | nindent 4 }} + {{- with .Values.daemonsetAnnotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + selector: + matchLabels: + {{- include "prometheus-node-exporter.selectorLabels" . | nindent 6 }} + revisionHistoryLimit: {{ .Values.revisionHistoryLimit }} + {{- with .Values.updateStrategy }} + updateStrategy: + {{- toYaml . | nindent 4 }} + {{- end }} + template: + metadata: + {{- with .Values.podAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + {{- include "prometheus-node-exporter.labels" . | nindent 8 }} + spec: + automountServiceAccountToken: {{ ternary true false (or .Values.serviceAccount.automountServiceAccountToken .Values.kubeRBACProxy.enabled) }} + {{- with .Values.securityContext }} + securityContext: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.priorityClassName }} + priorityClassName: {{ . }} + {{- end }} + {{- with .Values.extraInitContainers }} + initContainers: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "prometheus-node-exporter.serviceAccountName" . }} + {{- with .Values.terminationGracePeriodSeconds }} + terminationGracePeriodSeconds: {{ . }} + {{- end }} + containers: + {{- $servicePort := ternary .Values.kubeRBACProxy.port .Values.service.port .Values.kubeRBACProxy.enabled }} + - name: node-exporter + image: {{ include "prometheus-node-exporter.image" . }} + imagePullPolicy: {{ .Values.image.pullPolicy }} + args: + - --path.procfs=/host/proc + - --path.sysfs=/host/sys + {{- if .Values.hostRootFsMount.enabled }} + - --path.rootfs=/host/root + {{- if semverCompare ">=1.4.0-0" (coalesce .Values.version .Values.image.tag .Chart.AppVersion) }} + - --path.udev.data=/host/root/run/udev/data + {{- end }} + {{- end }} + - --web.listen-address=[$(HOST_IP)]:{{ $servicePort }} + {{- with .Values.extraArgs }} + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .Values.containerSecurityContext }} + securityContext: + {{- toYaml . | nindent 12 }} + {{- end }} + env: + - name: HOST_IP + {{- if .Values.kubeRBACProxy.enabled }} + value: 127.0.0.1 + {{- else if .Values.service.listenOnAllInterfaces }} + value: 0.0.0.0 + {{- else }} + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.hostIP + {{- end }} + {{- range $key, $value := .Values.env }} + - name: {{ $key }} + value: {{ $value | quote }} + {{- end }} + {{- if eq .Values.kubeRBACProxy.enabled false }} + ports: + - name: {{ .Values.service.portName }} + containerPort: {{ .Values.service.port }} + protocol: TCP + {{- end }} + livenessProbe: + failureThreshold: {{ .Values.livenessProbe.failureThreshold }} + httpGet: + {{- if .Values.kubeRBACProxy.enabled }} + host: 127.0.0.1 + {{- end }} + httpHeaders: + {{- range $_, $header := .Values.livenessProbe.httpGet.httpHeaders }} + - name: {{ $header.name }} + value: {{ $header.value }} + {{- end }} + path: / + port: {{ $servicePort }} + scheme: {{ upper .Values.livenessProbe.httpGet.scheme }} + initialDelaySeconds: {{ .Values.livenessProbe.initialDelaySeconds }} + periodSeconds: {{ .Values.livenessProbe.periodSeconds }} + successThreshold: {{ .Values.livenessProbe.successThreshold }} + timeoutSeconds: {{ .Values.livenessProbe.timeoutSeconds }} + readinessProbe: + failureThreshold: {{ .Values.readinessProbe.failureThreshold }} + httpGet: + {{- if .Values.kubeRBACProxy.enabled }} + host: 127.0.0.1 + {{- end }} + httpHeaders: + {{- range $_, $header := .Values.readinessProbe.httpGet.httpHeaders }} + - name: {{ $header.name }} + value: {{ $header.value }} + {{- end }} + path: / + port: {{ $servicePort }} + scheme: {{ upper .Values.readinessProbe.httpGet.scheme }} + initialDelaySeconds: {{ .Values.readinessProbe.initialDelaySeconds }} + periodSeconds: {{ .Values.readinessProbe.periodSeconds }} + successThreshold: {{ .Values.readinessProbe.successThreshold }} + timeoutSeconds: {{ .Values.readinessProbe.timeoutSeconds }} + {{- with .Values.resources }} + resources: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- if .Values.terminationMessageParams.enabled }} + {{- with .Values.terminationMessageParams }} + terminationMessagePath: {{ .terminationMessagePath }} + terminationMessagePolicy: {{ .terminationMessagePolicy }} + {{- end }} + {{- end }} + volumeMounts: + - name: proc + mountPath: /host/proc + {{- with .Values.hostProcFsMount.mountPropagation }} + mountPropagation: {{ . }} + {{- end }} + readOnly: true + - name: sys + mountPath: /host/sys + {{- with .Values.hostSysFsMount.mountPropagation }} + mountPropagation: {{ . }} + {{- end }} + readOnly: true + {{- if .Values.hostRootFsMount.enabled }} + - name: root + mountPath: /host/root + {{- with .Values.hostRootFsMount.mountPropagation }} + mountPropagation: {{ . }} + {{- end }} + readOnly: true + {{- end }} + {{- range $_, $mount := .Values.extraHostVolumeMounts }} + - name: {{ $mount.name }} + mountPath: {{ $mount.mountPath }} + readOnly: {{ $mount.readOnly }} + {{- with $mount.mountPropagation }} + mountPropagation: {{ . }} + {{- end }} + {{- end }} + {{- range $_, $mount := .Values.sidecarVolumeMount }} + - name: {{ $mount.name }} + mountPath: {{ $mount.mountPath }} + readOnly: true + {{- end }} + {{- range $_, $mount := .Values.configmaps }} + - name: {{ $mount.name }} + mountPath: {{ $mount.mountPath }} + {{- end }} + {{- range $_, $mount := .Values.secrets }} + - name: {{ .name }} + mountPath: {{ .mountPath }} + {{- end }} + {{- range .Values.sidecars }} + {{- $overwrites := dict "volumeMounts" (concat (include "prometheus-node-exporter.sidecarVolumeMounts" $ | fromYamlArray) (.volumeMounts | default list) | default list) }} + {{- $defaults := dict "image" (include "prometheus-node-exporter.image" $) "securityContext" $.Values.containerSecurityContext "imagePullPolicy" $.Values.image.pullPolicy }} + - {{- toYaml (merge $overwrites . $defaults) | nindent 10 }} + {{- end }} + {{- if .Values.kubeRBACProxy.enabled }} + - name: kube-rbac-proxy + args: + {{- if .Values.kubeRBACProxy.extraArgs }} + {{- .Values.kubeRBACProxy.extraArgs | toYaml | nindent 12 }} + {{- end }} + - --secure-listen-address=:{{ .Values.service.port}} + - --upstream=http://127.0.0.1:{{ $servicePort }}/ + - --proxy-endpoints-port={{ .Values.kubeRBACProxy.proxyEndpointsPort }} + - --config-file=/etc/kube-rbac-proxy-config/config-file.yaml + volumeMounts: + - name: kube-rbac-proxy-config + mountPath: /etc/kube-rbac-proxy-config + imagePullPolicy: {{ .Values.kubeRBACProxy.image.pullPolicy }} + {{- if .Values.kubeRBACProxy.image.sha }} + image: "{{ .Values.global.imageRegistry | default .Values.kubeRBACProxy.image.registry}}/{{ .Values.kubeRBACProxy.image.repository }}:{{ .Values.kubeRBACProxy.image.tag }}@sha256:{{ .Values.kubeRBACProxy.image.sha }}" + {{- else }} + image: "{{ .Values.global.imageRegistry | default .Values.kubeRBACProxy.image.registry}}/{{ .Values.kubeRBACProxy.image.repository }}:{{ .Values.kubeRBACProxy.image.tag }}" + {{- end }} + ports: + - containerPort: {{ .Values.service.port}} + name: {{ .Values.kubeRBACProxy.portName }} + {{- if .Values.kubeRBACProxy.enableHostPort }} + hostPort: {{ .Values.service.port }} + {{- end }} + - containerPort: {{ .Values.kubeRBACProxy.proxyEndpointsPort }} + {{- if .Values.kubeRBACProxy.enableProxyEndpointsHostPort }} + hostPort: {{ .Values.kubeRBACProxy.proxyEndpointsPort }} + {{- end }} + name: "http-healthz" + readinessProbe: + httpGet: + scheme: HTTPS + port: {{ .Values.kubeRBACProxy.proxyEndpointsPort }} + path: healthz + initialDelaySeconds: 5 + timeoutSeconds: 5 + {{- if .Values.kubeRBACProxy.resources }} + resources: + {{- toYaml .Values.kubeRBACProxy.resources | nindent 12 }} + {{- end }} + {{- if .Values.terminationMessageParams.enabled }} + {{- with .Values.terminationMessageParams }} + terminationMessagePath: {{ .terminationMessagePath }} + terminationMessagePolicy: {{ .terminationMessagePolicy }} + {{- end }} + {{- end }} + {{- with .Values.kubeRBACProxy.env }} + env: + {{- range $key, $value := $.Values.kubeRBACProxy.env }} + - name: {{ $key }} + value: {{ $value | quote }} + {{- end }} + {{- end }} + {{- if .Values.kubeRBACProxy.containerSecurityContext }} + securityContext: + {{ toYaml .Values.kubeRBACProxy.containerSecurityContext | nindent 12 }} + {{- end }} + {{- end }} + {{- if or .Values.imagePullSecrets .Values.global.imagePullSecrets }} + imagePullSecrets: + {{- include "prometheus-node-exporter.imagePullSecrets" (dict "Values" .Values "imagePullSecrets" .Values.imagePullSecrets) | indent 8 }} + {{- end }} + hostNetwork: {{ .Values.hostNetwork }} + hostPID: {{ .Values.hostPID }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.dnsConfig }} + dnsConfig: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.restartPolicy }} + restartPolicy: {{ . }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + volumes: + - name: proc + hostPath: + path: /proc + - name: sys + hostPath: + path: /sys + {{- if .Values.hostRootFsMount.enabled }} + - name: root + hostPath: + path: / + {{- end }} + {{- range $_, $mount := .Values.extraHostVolumeMounts }} + - name: {{ $mount.name }} + hostPath: + path: {{ $mount.hostPath }} + {{- with $mount.type }} + type: {{ . }} + {{- end }} + {{- end }} + {{- range $_, $mount := .Values.sidecarVolumeMount }} + - name: {{ $mount.name }} + emptyDir: + medium: Memory + {{- end }} + {{- range $_, $mount := .Values.sidecarHostVolumeMounts }} + - name: {{ $mount.name }} + hostPath: + path: {{ $mount.hostPath }} + {{- end }} + {{- range $_, $mount := .Values.configmaps }} + - name: {{ $mount.name }} + configMap: + name: {{ $mount.name }} + {{- end }} + {{- range $_, $mount := .Values.secrets }} + - name: {{ $mount.name }} + secret: + secretName: {{ $mount.name }} + {{- end }} + {{- if .Values.kubeRBACProxy.enabled }} + - name: kube-rbac-proxy-config + configMap: + name: {{ template "prometheus-node-exporter.fullname" . }}-rbac-config + {{- end }} diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/templates/endpoints.yaml b/4.validation_and_observability/3.efa-node-exporter/EKS/templates/endpoints.yaml new file mode 100644 index 00000000..45eeb8d9 --- /dev/null +++ b/4.validation_and_observability/3.efa-node-exporter/EKS/templates/endpoints.yaml @@ -0,0 +1,18 @@ +{{- if .Values.endpoints }} +apiVersion: v1 +kind: Endpoints +metadata: + name: {{ include "prometheus-node-exporter.fullname" . }} + namespace: {{ include "prometheus-node-exporter.namespace" . }} + labels: + {{- include "prometheus-node-exporter.labels" . | nindent 4 }} +subsets: + - addresses: + {{- range .Values.endpoints }} + - ip: {{ . }} + {{- end }} + ports: + - name: {{ .Values.service.portName }} + port: 9100 + protocol: TCP +{{- end }} diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/templates/extra-manifests.yaml b/4.validation_and_observability/3.efa-node-exporter/EKS/templates/extra-manifests.yaml new file mode 100644 index 00000000..2b21b710 --- /dev/null +++ b/4.validation_and_observability/3.efa-node-exporter/EKS/templates/extra-manifests.yaml @@ -0,0 +1,4 @@ +{{ range .Values.extraManifests }} +--- +{{ tpl . $ }} +{{ end }} diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/templates/networkpolicy.yaml b/4.validation_and_observability/3.efa-node-exporter/EKS/templates/networkpolicy.yaml new file mode 100644 index 00000000..82572272 --- /dev/null +++ b/4.validation_and_observability/3.efa-node-exporter/EKS/templates/networkpolicy.yaml @@ -0,0 +1,23 @@ +{{- if .Values.networkPolicy.enabled }} +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: {{ include "prometheus-node-exporter.fullname" . }} + namespace: {{ include "prometheus-node-exporter.namespace" . }} + labels: + {{- include "prometheus-node-exporter.labels" $ | nindent 4 }} + {{- with .Values.service.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + ingress: + - ports: + - port: {{ .Values.service.port }} + policyTypes: + - Egress + - Ingress + podSelector: + matchLabels: + {{- include "prometheus-node-exporter.selectorLabels" . | nindent 6 }} +{{- end }} diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/templates/podmonitor.yaml b/4.validation_and_observability/3.efa-node-exporter/EKS/templates/podmonitor.yaml new file mode 100644 index 00000000..f88da6a3 --- /dev/null +++ b/4.validation_and_observability/3.efa-node-exporter/EKS/templates/podmonitor.yaml @@ -0,0 +1,91 @@ +{{- if .Values.prometheus.podMonitor.enabled }} +apiVersion: {{ .Values.prometheus.podMonitor.apiVersion | default "monitoring.coreos.com/v1" }} +kind: PodMonitor +metadata: + name: {{ include "prometheus-node-exporter.fullname" . }} + namespace: {{ include "prometheus-node-exporter.podmonitor-namespace" . }} + labels: + {{- include "prometheus-node-exporter.labels" . | nindent 4 }} + {{- with .Values.prometheus.podMonitor.additionalLabels }} + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + jobLabel: {{ default "app.kubernetes.io/name" .Values.prometheus.podMonitor.jobLabel }} + {{- include "podmonitor.scrapeLimits" .Values.prometheus.podMonitor | nindent 2 }} + selector: + matchLabels: + {{- with .Values.prometheus.podMonitor.selectorOverride }} + {{- toYaml . | nindent 6 }} + {{- else }} + {{- include "prometheus-node-exporter.selectorLabels" . | nindent 6 }} + {{- end }} + namespaceSelector: + matchNames: + - {{ include "prometheus-node-exporter.namespace" . }} + {{- with .Values.prometheus.podMonitor.attachMetadata }} + attachMetadata: + {{- toYaml . | nindent 4 }} + {{- end }} + {{- with .Values.prometheus.podMonitor.podTargetLabels }} + podTargetLabels: + {{- toYaml . | nindent 4 }} + {{- end }} + podMetricsEndpoints: + - port: {{ .Values.service.portName }} + {{- with .Values.prometheus.podMonitor.scheme }} + scheme: {{ . }} + {{- end }} + {{- with .Values.prometheus.podMonitor.path }} + path: {{ . }} + {{- end }} + {{- with .Values.prometheus.podMonitor.basicAuth }} + basicAuth: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.prometheus.podMonitor.bearerTokenSecret }} + bearerTokenSecret: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.prometheus.podMonitor.tlsConfig }} + tlsConfig: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.prometheus.podMonitor.authorization }} + authorization: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.prometheus.podMonitor.oauth2 }} + oauth2: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.prometheus.podMonitor.proxyUrl }} + proxyUrl: {{ . }} + {{- end }} + {{- with .Values.prometheus.podMonitor.interval }} + interval: {{ . }} + {{- end }} + {{- with .Values.prometheus.podMonitor.honorTimestamps }} + honorTimestamps: {{ . }} + {{- end }} + {{- with .Values.prometheus.podMonitor.honorLabels }} + honorLabels: {{ . }} + {{- end }} + {{- with .Values.prometheus.podMonitor.scrapeTimeout }} + scrapeTimeout: {{ . }} + {{- end }} + {{- with .Values.prometheus.podMonitor.relabelings }} + relabelings: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.prometheus.podMonitor.metricRelabelings }} + metricRelabelings: + {{- toYaml . | nindent 8 }} + {{- end }} + enableHttp2: {{ default false .Values.prometheus.podMonitor.enableHttp2 }} + filterRunning: {{ default true .Values.prometheus.podMonitor.filterRunning }} + followRedirects: {{ default false .Values.prometheus.podMonitor.followRedirects }} + {{- with .Values.prometheus.podMonitor.params }} + params: + {{- toYaml . | nindent 8 }} + {{- end }} +{{- end }} diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/templates/psp-clusterrole.yaml b/4.validation_and_observability/3.efa-node-exporter/EKS/templates/psp-clusterrole.yaml new file mode 100644 index 00000000..89573172 --- /dev/null +++ b/4.validation_and_observability/3.efa-node-exporter/EKS/templates/psp-clusterrole.yaml @@ -0,0 +1,14 @@ +{{- if and .Values.rbac.create .Values.rbac.pspEnabled (.Capabilities.APIVersions.Has "policy/v1beta1/PodSecurityPolicy") }} +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: psp-{{ include "prometheus-node-exporter.fullname" . }} + labels: + {{- include "prometheus-node-exporter.labels" . | nindent 4 }} +rules: +- apiGroups: ['extensions'] + resources: ['podsecuritypolicies'] + verbs: ['use'] + resourceNames: + - {{ include "prometheus-node-exporter.fullname" . }} +{{- end }} diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/templates/psp-clusterrolebinding.yaml b/4.validation_and_observability/3.efa-node-exporter/EKS/templates/psp-clusterrolebinding.yaml new file mode 100644 index 00000000..33337017 --- /dev/null +++ b/4.validation_and_observability/3.efa-node-exporter/EKS/templates/psp-clusterrolebinding.yaml @@ -0,0 +1,16 @@ +{{- if and .Values.rbac.create .Values.rbac.pspEnabled (.Capabilities.APIVersions.Has "policy/v1beta1/PodSecurityPolicy") }} +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: psp-{{ include "prometheus-node-exporter.fullname" . }} + labels: + {{- include "prometheus-node-exporter.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: psp-{{ include "prometheus-node-exporter.fullname" . }} +subjects: + - kind: ServiceAccount + name: {{ include "prometheus-node-exporter.fullname" . }} + namespace: {{ include "prometheus-node-exporter.namespace" . }} +{{- end }} diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/templates/psp.yaml b/4.validation_and_observability/3.efa-node-exporter/EKS/templates/psp.yaml new file mode 100644 index 00000000..4896c84d --- /dev/null +++ b/4.validation_and_observability/3.efa-node-exporter/EKS/templates/psp.yaml @@ -0,0 +1,49 @@ +{{- if and .Values.rbac.create .Values.rbac.pspEnabled (.Capabilities.APIVersions.Has "policy/v1beta1/PodSecurityPolicy") }} +apiVersion: policy/v1beta1 +kind: PodSecurityPolicy +metadata: + name: {{ include "prometheus-node-exporter.fullname" . }} + namespace: {{ include "prometheus-node-exporter.namespace" . }} + labels: + {{- include "prometheus-node-exporter.labels" . | nindent 4 }} + {{- with .Values.rbac.pspAnnotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + privileged: false + # Allow core volume types. + volumes: + - 'configMap' + - 'emptyDir' + - 'projected' + - 'secret' + - 'downwardAPI' + - 'persistentVolumeClaim' + - 'hostPath' + hostNetwork: true + hostIPC: false + hostPID: true + hostPorts: + - min: 0 + max: 65535 + runAsUser: + # Permits the container to run with root privileges as well. + rule: 'RunAsAny' + seLinux: + # This policy assumes the nodes are using AppArmor rather than SELinux. + rule: 'RunAsAny' + supplementalGroups: + rule: 'MustRunAs' + ranges: + # Allow adding the root group. + - min: 0 + max: 65535 + fsGroup: + rule: 'MustRunAs' + ranges: + # Allow adding the root group. + - min: 0 + max: 65535 + readOnlyRootFilesystem: false +{{- end }} diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/templates/rbac-configmap.yaml b/4.validation_and_observability/3.efa-node-exporter/EKS/templates/rbac-configmap.yaml new file mode 100644 index 00000000..814e1103 --- /dev/null +++ b/4.validation_and_observability/3.efa-node-exporter/EKS/templates/rbac-configmap.yaml @@ -0,0 +1,16 @@ +{{- if .Values.kubeRBACProxy.enabled}} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ template "prometheus-node-exporter.fullname" . }}-rbac-config + namespace: {{ include "prometheus-node-exporter.namespace" . }} +data: + config-file.yaml: |+ + authorization: + resourceAttributes: + namespace: {{ template "prometheus-node-exporter.namespace" . }} + apiVersion: v1 + resource: services + subresource: {{ template "prometheus-node-exporter.fullname" . }} + name: {{ template "prometheus-node-exporter.fullname" . }} +{{- end }} \ No newline at end of file diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/templates/service.yaml b/4.validation_and_observability/3.efa-node-exporter/EKS/templates/service.yaml new file mode 100644 index 00000000..cd3ee55a --- /dev/null +++ b/4.validation_and_observability/3.efa-node-exporter/EKS/templates/service.yaml @@ -0,0 +1,32 @@ +{{- if .Values.service.enabled }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "prometheus-node-exporter.fullname" . }} + namespace: {{ include "prometheus-node-exporter.namespace" . }} + labels: + {{- include "prometheus-node-exporter.labels" $ | nindent 4 }} + {{- with .Values.service.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: +{{- if .Values.service.ipDualStack.enabled }} + ipFamilies: {{ toYaml .Values.service.ipDualStack.ipFamilies | nindent 4 }} + ipFamilyPolicy: {{ .Values.service.ipDualStack.ipFamilyPolicy }} +{{- end }} +{{- if .Values.service.externalTrafficPolicy }} + externalTrafficPolicy: {{ .Values.service.externalTrafficPolicy }} +{{- end }} + type: {{ .Values.service.type }} + ports: + - port: {{ .Values.service.port }} + {{- if ( and (eq .Values.service.type "NodePort" ) (not (empty .Values.service.nodePort)) ) }} + nodePort: {{ .Values.service.nodePort }} + {{- end }} + targetPort: {{ .Values.service.targetPort }} + protocol: TCP + name: {{ .Values.service.portName }} + selector: + {{- include "prometheus-node-exporter.selectorLabels" . | nindent 4 }} +{{- end }} diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/templates/serviceaccount.yaml b/4.validation_and_observability/3.efa-node-exporter/EKS/templates/serviceaccount.yaml new file mode 100644 index 00000000..5c3348c0 --- /dev/null +++ b/4.validation_and_observability/3.efa-node-exporter/EKS/templates/serviceaccount.yaml @@ -0,0 +1,17 @@ +{{- if and .Values.rbac.create .Values.serviceAccount.create -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "prometheus-node-exporter.serviceAccountName" . }} + namespace: {{ include "prometheus-node-exporter.namespace" . }} + labels: + {{- include "prometheus-node-exporter.labels" . | nindent 4 }} + {{- with .Values.serviceAccount.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +{{- if or .Values.serviceAccount.imagePullSecrets .Values.global.imagePullSecrets }} +imagePullSecrets: + {{- include "prometheus-node-exporter.imagePullSecrets" (dict "Values" .Values "imagePullSecrets" .Values.serviceAccount.imagePullSecrets) | indent 2 }} +{{- end }} +{{- end -}} diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/templates/servicemonitor.yaml b/4.validation_and_observability/3.efa-node-exporter/EKS/templates/servicemonitor.yaml new file mode 100644 index 00000000..0d7a42ea --- /dev/null +++ b/4.validation_and_observability/3.efa-node-exporter/EKS/templates/servicemonitor.yaml @@ -0,0 +1,61 @@ +{{- if .Values.prometheus.monitor.enabled }} +apiVersion: {{ .Values.prometheus.monitor.apiVersion | default "monitoring.coreos.com/v1" }} +kind: ServiceMonitor +metadata: + name: {{ include "prometheus-node-exporter.fullname" . }} + namespace: {{ include "prometheus-node-exporter.monitor-namespace" . }} + labels: + {{- include "prometheus-node-exporter.labels" . | nindent 4 }} + {{- with .Values.prometheus.monitor.additionalLabels }} + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + jobLabel: {{ default "app.kubernetes.io/name" .Values.prometheus.monitor.jobLabel }} + {{- include "servicemonitor.scrapeLimits" .Values.prometheus.monitor | nindent 2 }} + {{- with .Values.prometheus.monitor.podTargetLabels }} + podTargetLabels: + {{- toYaml . | nindent 4 }} + {{- end }} + selector: + matchLabels: + {{- with .Values.prometheus.monitor.selectorOverride }} + {{- toYaml . | nindent 6 }} + {{- else }} + {{- include "prometheus-node-exporter.selectorLabels" . | nindent 6 }} + {{- end }} + {{- with .Values.prometheus.monitor.attachMetadata }} + attachMetadata: + {{- toYaml . | nindent 4 }} + {{- end }} + endpoints: + - port: {{ .Values.service.portName }} + scheme: {{ .Values.prometheus.monitor.scheme }} + {{- with .Values.prometheus.monitor.basicAuth }} + basicAuth: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.prometheus.monitor.bearerTokenFile }} + bearerTokenFile: {{ . }} + {{- end }} + {{- with .Values.prometheus.monitor.tlsConfig }} + tlsConfig: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.prometheus.monitor.proxyUrl }} + proxyUrl: {{ . }} + {{- end }} + {{- with .Values.prometheus.monitor.interval }} + interval: {{ . }} + {{- end }} + {{- with .Values.prometheus.monitor.scrapeTimeout }} + scrapeTimeout: {{ . }} + {{- end }} + {{- with .Values.prometheus.monitor.relabelings }} + relabelings: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.prometheus.monitor.metricRelabelings }} + metricRelabelings: + {{- toYaml . | nindent 8 }} + {{- end }} +{{- end }} diff --git a/4.validation_and_observability/3.efa-node-exporter/EKS/templates/verticalpodautoscaler.yaml b/4.validation_and_observability/3.efa-node-exporter/EKS/templates/verticalpodautoscaler.yaml new file mode 100644 index 00000000..2c2705f8 --- /dev/null +++ b/4.validation_and_observability/3.efa-node-exporter/EKS/templates/verticalpodautoscaler.yaml @@ -0,0 +1,40 @@ +{{- if and (.Capabilities.APIVersions.Has "autoscaling.k8s.io/v1") (.Values.verticalPodAutoscaler.enabled) }} +apiVersion: autoscaling.k8s.io/v1 +kind: VerticalPodAutoscaler +metadata: + name: {{ include "prometheus-node-exporter.fullname" . }} + namespace: {{ include "prometheus-node-exporter.namespace" . }} + labels: + {{- include "prometheus-node-exporter.labels" . | nindent 4 }} +spec: + {{- with .Values.verticalPodAutoscaler.recommenders }} + recommenders: + {{- toYaml . | nindent 4 }} + {{- end }} + resourcePolicy: + containerPolicies: + - containerName: node-exporter + {{- with .Values.verticalPodAutoscaler.controlledResources }} + controlledResources: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.verticalPodAutoscaler.controlledValues }} + controlledValues: {{ . }} + {{- end }} + {{- with .Values.verticalPodAutoscaler.maxAllowed }} + maxAllowed: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.verticalPodAutoscaler.minAllowed }} + minAllowed: + {{- toYaml . | nindent 8 }} + {{- end }} + targetRef: + apiVersion: apps/v1 + kind: DaemonSet + name: {{ include "prometheus-node-exporter.fullname" . }} + {{- with .Values.verticalPodAutoscaler.updatePolicy }} + updatePolicy: + {{- toYaml . | nindent 4 }} + {{- end }} +{{- end }}