From 3ee712027d45e628a94f80f14e3fefadb78d77d7 Mon Sep 17 00:00:00 2001 From: Michael Hotan Date: Tue, 25 Jun 2024 10:14:08 -0700 Subject: [PATCH] Helm chart updates related to Prometheus, Webhook HPA, and Flyteconsole probes (#5508) --- charts/flyte-core/README.md | 26 +++++++++- charts/flyte-core/templates/_helpers.tpl | 1 - .../clusterresourcesync/deployment.yaml | 11 +++++ .../templates/console/deployment.yaml | 13 +++++ .../templates/console/service-monitor.yaml | 19 +++++++ .../flyte-core/templates/console/service.yaml | 5 ++ .../templates/propeller/deployment.yaml | 10 ++++ .../templates/propeller/webhook-hpa.yaml | 17 +++++++ .../templates/propeller/webhook.yaml | 10 ++++ charts/flyte-core/values.yaml | 49 +++++++++++++++++++ .../flyte_aws_scheduler_helm_generated.yaml | 6 +++ .../flyte_helm_controlplane_generated.yaml | 2 + .../eks/flyte_helm_dataplane_generated.yaml | 4 ++ deployment/eks/flyte_helm_generated.yaml | 6 +++ .../flyte_helm_controlplane_generated.yaml | 2 + .../gcp/flyte_helm_dataplane_generated.yaml | 4 ++ deployment/gcp/flyte_helm_generated.yaml | 6 +++ deployment/sandbox/flyte_helm_generated.yaml | 6 +++ .../manifests/complete-agent.yaml | 4 +- .../sandbox-bundled/manifests/complete.yaml | 4 +- docker/sandbox-bundled/manifests/dev.yaml | 4 +- flyteadmin/cmd/entrypoints/clusterresource.go | 12 +++++ flyteadmin/pkg/clusterresource/controller.go | 5 +- 23 files changed, 217 insertions(+), 9 deletions(-) create mode 100644 charts/flyte-core/templates/console/service-monitor.yaml create mode 100644 charts/flyte-core/templates/propeller/webhook-hpa.yaml diff --git a/charts/flyte-core/README.md b/charts/flyte-core/README.md index 4662eeef66..3703707660 100644 --- a/charts/flyte-core/README.md +++ b/charts/flyte-core/README.md @@ -60,7 +60,7 @@ helm install gateway bitnami/contour -n flyte | cloud_events.eventsPublisher.eventTypes[0] | string | `"all"` | | | cloud_events.eventsPublisher.topicName | string | 
`"arn:aws:sns:us-east-2:123456:123-my-topic"` | | | cloud_events.type | string | `"aws"` | | -| cluster_resource_manager | object | `{"config":{"cluster_resources":{"customData":[{"production":[{"projectQuotaCpu":{"value":"5"}},{"projectQuotaMemory":{"value":"4000Mi"}}]},{"staging":[{"projectQuotaCpu":{"value":"2"}},{"projectQuotaMemory":{"value":"3000Mi"}}]},{"development":[{"projectQuotaCpu":{"value":"4"}},{"projectQuotaMemory":{"value":"3000Mi"}}]}],"refreshInterval":"5m","standaloneDeployment":false,"templatePath":"/etc/flyte/clusterresource/templates"}},"enabled":true,"nodeSelector":{},"podAnnotations":{},"podEnv":{},"podLabels":{},"resources":{},"service_account_name":"flyteadmin","standaloneDeployment":false,"templates":[{"key":"aa_namespace","value":"apiVersion: v1\nkind: Namespace\nmetadata:\n name: {{ namespace }}\nspec:\n finalizers:\n - kubernetes\n"},{"key":"ab_project_resource_quota","value":"apiVersion: v1\nkind: ResourceQuota\nmetadata:\n name: project-quota\n namespace: {{ namespace }}\nspec:\n hard:\n limits.cpu: {{ projectQuotaCpu }}\n limits.memory: {{ projectQuotaMemory }}\n"}]}` | Configuration for the Cluster resource manager component. This is an optional component, that enables automatic cluster configuration. 
This is useful to set default quotas, manage namespaces etc that map to a project/domain | +| cluster_resource_manager | object | `{"config":{"cluster_resources":{"customData":[{"production":[{"projectQuotaCpu":{"value":"5"}},{"projectQuotaMemory":{"value":"4000Mi"}}]},{"staging":[{"projectQuotaCpu":{"value":"2"}},{"projectQuotaMemory":{"value":"3000Mi"}}]},{"development":[{"projectQuotaCpu":{"value":"4"}},{"projectQuotaMemory":{"value":"3000Mi"}}]}],"refreshInterval":"5m","standaloneDeployment":false,"templatePath":"/etc/flyte/clusterresource/templates"}},"enabled":true,"nodeSelector":{},"podAnnotations":{},"podEnv":{},"podLabels":{},"prometheus":{"enabled":false,"path":"/metrics","port":10254},"resources":{},"service_account_name":"flyteadmin","standaloneDeployment":false,"templates":[{"key":"aa_namespace","value":"apiVersion: v1\nkind: Namespace\nmetadata:\n name: {{ namespace }}\nspec:\n finalizers:\n - kubernetes\n"},{"key":"ab_project_resource_quota","value":"apiVersion: v1\nkind: ResourceQuota\nmetadata:\n name: project-quota\n namespace: {{ namespace }}\nspec:\n hard:\n limits.cpu: {{ projectQuotaCpu }}\n limits.memory: {{ projectQuotaMemory }}\n"}]}` | Configuration for the Cluster resource manager component. This is an optional component, that enables automatic cluster configuration. 
This is useful to set default quotas, manage namespaces etc that map to a project/domain | | cluster_resource_manager.config | object | `{"cluster_resources":{"customData":[{"production":[{"projectQuotaCpu":{"value":"5"}},{"projectQuotaMemory":{"value":"4000Mi"}}]},{"staging":[{"projectQuotaCpu":{"value":"2"}},{"projectQuotaMemory":{"value":"3000Mi"}}]},{"development":[{"projectQuotaCpu":{"value":"4"}},{"projectQuotaMemory":{"value":"3000Mi"}}]}],"refreshInterval":"5m","standaloneDeployment":false,"templatePath":"/etc/flyte/clusterresource/templates"}}` | Configmap for ClusterResource parameters | | cluster_resource_manager.config.cluster_resources | object | `{"customData":[{"production":[{"projectQuotaCpu":{"value":"5"}},{"projectQuotaMemory":{"value":"4000Mi"}}]},{"staging":[{"projectQuotaCpu":{"value":"2"}},{"projectQuotaMemory":{"value":"3000Mi"}}]},{"development":[{"projectQuotaCpu":{"value":"4"}},{"projectQuotaMemory":{"value":"3000Mi"}}]}],"refreshInterval":"5m","standaloneDeployment":false,"templatePath":"/etc/flyte/clusterresource/templates"}` | ClusterResource parameters Refer to the [structure](https://pkg.go.dev/github.com/lyft/flyteadmin@v0.3.37/pkg/runtime/interfaces#ClusterResourceConfig) to customize. 
| | cluster_resource_manager.config.cluster_resources.refreshInterval | string | `"5m"` | How frequently to run the sync process | @@ -209,15 +209,22 @@ helm install gateway bitnami/contour -n flyte | flyteconsole.image.repository | string | `"cr.flyte.org/flyteorg/flyteconsole"` | Docker image for Flyteconsole deployment | | flyteconsole.image.tag | string | `"v1.14.0"` | | | flyteconsole.imagePullSecrets | list | `[]` | ImagePullSecrets to assign to the Flyteconsole deployment | +| flyteconsole.livenessProbe | object | `{}` | | | flyteconsole.nodeSelector | object | `{}` | nodeSelector for Flyteconsole deployment | | flyteconsole.podAnnotations | object | `{}` | Annotations for Flyteconsole pods | | flyteconsole.podEnv | object | `{}` | Additional Flyteconsole container environment variables | | flyteconsole.podLabels | object | `{}` | Labels for Flyteconsole pods | | flyteconsole.priorityClassName | string | `""` | Sets priorityClassName for flyte console pod(s). | +| flyteconsole.readinessProbe | object | `{}` | | | flyteconsole.replicaCount | int | `1` | Replicas count for Flyteconsole deployment | | flyteconsole.resources | object | `{"limits":{"cpu":"500m","memory":"250Mi"},"requests":{"cpu":"10m","memory":"50Mi"}}` | Default resources requests and limits for Flyteconsole deployment | | flyteconsole.securityContext | object | `{"fsGroupChangePolicy":"OnRootMismatch","runAsNonRoot":true,"runAsUser":1000,"seLinuxOptions":{"type":"spc_t"}}` | Sets securityContext for flyteconsole pod(s). 
| | flyteconsole.service | object | `{"annotations":{},"type":"ClusterIP"}` | Service settings for Flyteconsole | +| flyteconsole.serviceMonitor | object | `{"enabled":false,"interval":"60s","labels":{},"scrapeTimeout":"30s"}` | Settings for flyteconsole service monitor | +| flyteconsole.serviceMonitor.enabled | bool | `false` | If enabled create the flyteconsole service monitor | +| flyteconsole.serviceMonitor.interval | string | `"60s"` | Sets the interval at which metrics will be scraped by prometheus | +| flyteconsole.serviceMonitor.labels | object | `{}` | Sets the labels for the service monitor which are required by the prometheus to auto-detect the service monitor and start scraping the metrics | +| flyteconsole.serviceMonitor.scrapeTimeout | string | `"30s"` | Sets the timeout after which request to scrape metrics will time out | | flyteconsole.tolerations | list | `[]` | tolerations for Flyteconsole deployment | | flytepropeller.additionalContainers | list | `[]` | Appends additional containers to the deployment spec. May include template values. | | flytepropeller.additionalVolumeMounts | list | `[]` | Appends additional volume mounts to the main container's spec. May include template values. | @@ -238,6 +245,9 @@ helm install gateway bitnami/contour -n flyte | flytepropeller.podEnv | object | `{}` | Additional Flytepropeller container environment variables | | flytepropeller.podLabels | object | `{}` | Labels for Flytepropeller pods | | flytepropeller.priorityClassName | string | `""` | Sets priorityClassName for propeller pod(s). 
| +| flytepropeller.prometheus.enabled | bool | `false` | | +| flytepropeller.prometheus.path | string | `"/metrics"` | | +| flytepropeller.prometheus.port | int | `10254` | | | flytepropeller.replicaCount | int | `1` | Replicas count for Flytepropeller deployment | | flytepropeller.resources | object | `{"limits":{"cpu":"200m","ephemeral-storage":"100Mi","memory":"200Mi"},"requests":{"cpu":"10m","ephemeral-storage":"50Mi","memory":"100Mi"}}` | Default resources requests and limits for Flytepropeller deployment | | flytepropeller.securityContext | object | `{"fsGroup":65534,"fsGroupChangePolicy":"Always","runAsUser":1001}` | Sets securityContext for flytepropeller pod(s). | @@ -295,8 +305,22 @@ helm install gateway bitnami/contour -n flyte | storage.s3.authType | string | `"iam"` | type of authentication to use for S3 buckets, can either be iam or accesskey | | storage.s3.secretKey | string | `""` | AWS IAM user secret access key to use for S3 bucket auth, only used if authType is set to accesskey | | storage.type | string | `"sandbox"` | Sets the storage type. Supported values are sandbox, s3, gcs and custom. 
| +| webhook.autoscaling.enabled | bool | `false` | | +| webhook.autoscaling.maxReplicas | int | `10` | | +| webhook.autoscaling.metrics[0].resource.name | string | `"cpu"` | | +| webhook.autoscaling.metrics[0].resource.target.averageUtilization | int | `80` | | +| webhook.autoscaling.metrics[0].resource.target.type | string | `"Utilization"` | | +| webhook.autoscaling.metrics[0].type | string | `"Resource"` | | +| webhook.autoscaling.metrics[1].resource.name | string | `"memory"` | | +| webhook.autoscaling.metrics[1].resource.target.averageUtilization | int | `80` | | +| webhook.autoscaling.metrics[1].resource.target.type | string | `"Utilization"` | | +| webhook.autoscaling.metrics[1].type | string | `"Resource"` | | +| webhook.autoscaling.minReplicas | int | `1` | | | webhook.enabled | bool | `true` | enable or disable secrets webhook | | webhook.priorityClassName | string | `""` | Sets priorityClassName for webhook pod | +| webhook.prometheus.enabled | bool | `false` | | +| webhook.prometheus.path | string | `"/metrics"` | | +| webhook.prometheus.port | int | `10254` | | | webhook.resources.requests.cpu | string | `"200m"` | | | webhook.resources.requests.ephemeral-storage | string | `"500Mi"` | | | webhook.resources.requests.memory | string | `"500Mi"` | | diff --git a/charts/flyte-core/templates/_helpers.tpl b/charts/flyte-core/templates/_helpers.tpl index b4361a1e47..f7b50c0b29 100755 --- a/charts/flyte-core/templates/_helpers.tpl +++ b/charts/flyte-core/templates/_helpers.tpl @@ -79,7 +79,6 @@ app.kubernetes.io/managed-by: {{ .Release.Service }} {{ toYaml . 
}} {{- end }} {{- end -}} - {{- define "datacatalog.name" -}} datacatalog {{- end -}} diff --git a/charts/flyte-core/templates/clusterresourcesync/deployment.yaml b/charts/flyte-core/templates/clusterresourcesync/deployment.yaml index a2fb5d04ae..9108a0b335 100644 --- a/charts/flyte-core/templates/clusterresourcesync/deployment.yaml +++ b/charts/flyte-core/templates/clusterresourcesync/deployment.yaml @@ -16,6 +16,11 @@ spec: {{- with .Values.cluster_resource_manager.podAnnotations }} {{- toYaml . | nindent 8 }} {{- end }} + prometheus.io/path: {{ .Values.cluster_resource_manager.prometheus.path | quote }} + prometheus.io/port: {{ .Values.cluster_resource_manager.prometheus.port | quote }} + {{- with .Values.cluster_resource_manager.prometheus.enabled }} + prometheus.io/scrape: "true" + {{- end }} labels: {{ include "flyteclusterresourcesync.podLabels" . | nindent 8 }} spec: containers: @@ -55,6 +60,12 @@ spec: - mountPath: /var/run/credentials name: cluster-secrets {{- end }} + {{- if .Values.cluster_resource_manager.prometheus.enabled }} + ports: + - containerPort: {{ .Values.cluster_resource_manager.prometheus.port }} + name: debug + protocol: TCP + {{- end }} serviceAccountName: {{ .Values.cluster_resource_manager.service_account_name }} volumes: {{- include "databaseSecret.volume" . 
| nindent 8 }} - configMap: diff --git a/charts/flyte-core/templates/console/deployment.yaml b/charts/flyte-core/templates/console/deployment.yaml index 2d89e0265a..834b3cf84c 100644 --- a/charts/flyte-core/templates/console/deployment.yaml +++ b/charts/flyte-core/templates/console/deployment.yaml @@ -37,6 +37,11 @@ spec: name: flyte-console-config ports: - containerPort: 8080 + {{- if .Values.flyteconsole.serviceMonitor.enabled }} + - containerPort: 8081 + name: http-metrics + protocol: TCP + {{- end }} {{- if or .Values.flyteconsole.ga.enabled .Values.flyteconsole.podEnv }} env: {{- end }} @@ -59,6 +64,14 @@ spec: volumeMounts: - mountPath: /srv/flyte name: shared-data + {{- with .Values.flyteconsole.livenessProbe }} + livenessProbe: + {{- toYaml . | nindent 10 }} + {{- end }} + {{- with .Values.flyteconsole.readinessProbe }} + readinessProbe: + {{- toYaml . | nindent 10 }} + {{- end }} volumes: - emptyDir: {} name: shared-data diff --git a/charts/flyte-core/templates/console/service-monitor.yaml b/charts/flyte-core/templates/console/service-monitor.yaml new file mode 100644 index 0000000000..84c9b21752 --- /dev/null +++ b/charts/flyte-core/templates/console/service-monitor.yaml @@ -0,0 +1,19 @@ +{{- if and .Values.flyteconsole.serviceMonitor.enabled .Values.flyteconsole.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ template "flyteconsole.name" . }} + namespace: {{ template "flyte.namespace" . }} + labels: + {{- with .Values.flyteconsole.serviceMonitor.labels }} + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + endpoints: + - interval: {{ .Values.flyteconsole.serviceMonitor.interval }} + port: http-metrics + path: /metrics + scrapeTimeout: {{ .Values.flyteconsole.serviceMonitor.scrapeTimeout }} + selector: + matchLabels: {{ include "flyteconsole.selectorLabels" . 
| nindent 6 }} +{{- end }} diff --git a/charts/flyte-core/templates/console/service.yaml b/charts/flyte-core/templates/console/service.yaml index b907b90960..1845294cef 100644 --- a/charts/flyte-core/templates/console/service.yaml +++ b/charts/flyte-core/templates/console/service.yaml @@ -17,5 +17,10 @@ spec: port: 80 protocol: TCP targetPort: 8080 + {{- if .Values.flyteconsole.serviceMonitor.enabled }} + - name: http-metrics + port: 8081 + protocol: TCP + {{- end }} selector: {{ include "flyteconsole.selectorLabels" . | nindent 4 }} {{- end }} diff --git a/charts/flyte-core/templates/propeller/deployment.yaml b/charts/flyte-core/templates/propeller/deployment.yaml index 5fd09e5d5d..9488dbf8a5 100644 --- a/charts/flyte-core/templates/propeller/deployment.yaml +++ b/charts/flyte-core/templates/propeller/deployment.yaml @@ -25,6 +25,11 @@ spec: {{- with .Values.flytepropeller.podAnnotations }} {{- toYaml . | nindent 8 }} {{- end }} + prometheus.io/path: {{ .Values.flytepropeller.prometheus.path | quote }} + prometheus.io/port: {{ .Values.flytepropeller.prometheus.port | quote }} + {{- with .Values.flytepropeller.prometheus.enabled }} + prometheus.io/scrape: "true" + {{- end }} {{- if .Values.flytepropeller.manager }} labels: {{ include "flytepropeller-manager.podLabels" . 
| nindent 8 }} {{- else }} @@ -78,6 +83,11 @@ spec: {{- end }} ports: - containerPort: {{ index .Values.configmap.core.propeller "prof-port" }} + {{- if .Values.flytepropeller.prometheus.enabled }} + - containerPort: {{ .Values.flytepropeller.prometheus.port }} + name: debug + protocol: TCP + {{- end }} resources: {{- toYaml .Values.flytepropeller.resources | nindent 10 }} volumeMounts: - name: config-volume diff --git a/charts/flyte-core/templates/propeller/webhook-hpa.yaml b/charts/flyte-core/templates/propeller/webhook-hpa.yaml new file mode 100644 index 0000000000..9562287fe7 --- /dev/null +++ b/charts/flyte-core/templates/propeller/webhook-hpa.yaml @@ -0,0 +1,17 @@ +{{- if .Values.webhook.autoscaling.enabled }} +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: {{ template "flyte-pod-webhook.name" . }} + labels: + app: {{ template "flyte-pod-webhook.name" . }} +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ template "flyte-pod-webhook.name" . }} + minReplicas: {{ .Values.webhook.autoscaling.minReplicas }} + maxReplicas: {{ .Values.webhook.autoscaling.maxReplicas }} + metrics: + {{ .Values.webhook.autoscaling.metrics | toYaml | nindent 4 }} +{{- end }} diff --git a/charts/flyte-core/templates/propeller/webhook.yaml b/charts/flyte-core/templates/propeller/webhook.yaml index 90241a69f8..89757eff7c 100644 --- a/charts/flyte-core/templates/propeller/webhook.yaml +++ b/charts/flyte-core/templates/propeller/webhook.yaml @@ -34,6 +34,11 @@ spec: {{- with .Values.flytepropeller.podAnnotations }} {{- toYaml . | nindent 8 }} {{- end }} + prometheus.io/path: {{ .Values.webhook.prometheus.path | quote }} + prometheus.io/port: {{ .Values.webhook.prometheus.port | quote }} + {{- with .Values.webhook.prometheus.enabled }} + prometheus.io/scrape: "true" + {{- end }} spec: {{- with .Values.webhook.securityContext }} securityContext: {{ tpl (toYaml .) 
$ | nindent 8 }} @@ -102,6 +107,11 @@ spec: {{- end }} ports: - containerPort: 9443 + {{- if .Values.webhook.prometheus.enabled }} + - containerPort: {{ .Values.webhook.prometheus.port }} + name: debug + protocol: TCP + {{- end }} securityContext: allowPrivilegeEscalation: false capabilities: diff --git a/charts/flyte-core/values.yaml b/charts/flyte-core/values.yaml index b7f390a45e..3f2932ad4b 100755 --- a/charts/flyte-core/values.yaml +++ b/charts/flyte-core/values.yaml @@ -383,6 +383,12 @@ flytepropeller: interval: 60s # -- Sets the timeout after which request to scrape metrics will time out scrapeTimeout: 30s + + prometheus: + enabled: false + path: "/metrics" + port: 10254 + # # FLYTECONSOLE SETTINGS # @@ -436,6 +442,21 @@ flyteconsole: seLinuxOptions: type: spc_t + # -- Settings for flyteconsole service monitor + serviceMonitor: + # -- If enabled create the flyteconsole service monitor + enabled: false + # -- Sets the interval at which metrics will be scraped by prometheus + interval: 60s + # -- Sets the timeout after which request to scrape metrics will time out + scrapeTimeout: 30s + # -- Sets the labels for the service monitor which are required by the + # prometheus to auto-detect the service monitor and start scraping the metrics + labels: {} + + livenessProbe: {} + readinessProbe: {} + # It will enable the redoc route in ingress deployRedoc: false @@ -492,6 +513,29 @@ webhook: ephemeral-storage: 500Mi memory: 500Mi + autoscaling: + enabled: false + minReplicas: 1 + maxReplicas: 10 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 + + prometheus: + enabled: false + path: "/metrics" + port: 10254 + # ------------------------------------------------ # # COMMON SETTINGS @@ -951,6 +995,11 @@ cluster_resource_manager: - projectQuotaMemory: value: "3000Mi" + prometheus: + enabled: false + path: 
"/metrics" + port: 10254 + # -- Resource templates that should be applied templates: # -- Template for namespaces resources diff --git a/deployment/eks/flyte_aws_scheduler_helm_generated.yaml b/deployment/eks/flyte_aws_scheduler_helm_generated.yaml index fadb80927f..2bf9fa2cfe 100644 --- a/deployment/eks/flyte_aws_scheduler_helm_generated.yaml +++ b/deployment/eks/flyte_aws_scheduler_helm_generated.yaml @@ -1045,6 +1045,8 @@ spec: metadata: annotations: configChecksum: "55ce597c10b17ef6e891f0c9242b17aafb3d7b4e4e414d0a5078d71ad9c804f" + prometheus.io/path: "/metrics" + prometheus.io/port: "10254" labels: app.kubernetes.io/name: flyteclusterresourcesync app.kubernetes.io/instance: flyte @@ -1270,6 +1272,8 @@ spec: metadata: annotations: configChecksum: "045a4308f47bb9665d221d5d83667a7c9e05ca761134dc79fa8295dd8b611dd" + prometheus.io/path: "/metrics" + prometheus.io/port: "10254" labels: app.kubernetes.io/name: flytepropeller app.kubernetes.io/instance: flyte @@ -1352,6 +1356,8 @@ spec: app.kubernetes.io/version: v1.12.1-rc0 annotations: configChecksum: "045a4308f47bb9665d221d5d83667a7c9e05ca761134dc79fa8295dd8b611dd" + prometheus.io/path: "/metrics" + prometheus.io/port: "10254" spec: securityContext: fsGroup: 65534 diff --git a/deployment/eks/flyte_helm_controlplane_generated.yaml b/deployment/eks/flyte_helm_controlplane_generated.yaml index f110eda151..982ed24d03 100644 --- a/deployment/eks/flyte_helm_controlplane_generated.yaml +++ b/deployment/eks/flyte_helm_controlplane_generated.yaml @@ -750,6 +750,8 @@ spec: metadata: annotations: configChecksum: "55ce597c10b17ef6e891f0c9242b17aafb3d7b4e4e414d0a5078d71ad9c804f" + prometheus.io/path: "/metrics" + prometheus.io/port: "10254" labels: app.kubernetes.io/name: flyteclusterresourcesync app.kubernetes.io/instance: flyte diff --git a/deployment/eks/flyte_helm_dataplane_generated.yaml b/deployment/eks/flyte_helm_dataplane_generated.yaml index f7b146dd9f..8d17d71279 100644 --- 
a/deployment/eks/flyte_helm_dataplane_generated.yaml +++ b/deployment/eks/flyte_helm_dataplane_generated.yaml @@ -429,6 +429,8 @@ spec: metadata: annotations: configChecksum: "045a4308f47bb9665d221d5d83667a7c9e05ca761134dc79fa8295dd8b611dd" + prometheus.io/path: "/metrics" + prometheus.io/port: "10254" labels: app.kubernetes.io/name: flytepropeller app.kubernetes.io/instance: flyte @@ -511,6 +513,8 @@ spec: app.kubernetes.io/version: v1.12.1-rc0 annotations: configChecksum: "045a4308f47bb9665d221d5d83667a7c9e05ca761134dc79fa8295dd8b611dd" + prometheus.io/path: "/metrics" + prometheus.io/port: "10254" spec: securityContext: fsGroup: 65534 diff --git a/deployment/eks/flyte_helm_generated.yaml b/deployment/eks/flyte_helm_generated.yaml index 4aba6e7f3d..9b2ce0aea3 100644 --- a/deployment/eks/flyte_helm_generated.yaml +++ b/deployment/eks/flyte_helm_generated.yaml @@ -1076,6 +1076,8 @@ spec: metadata: annotations: configChecksum: "55ce597c10b17ef6e891f0c9242b17aafb3d7b4e4e414d0a5078d71ad9c804f" + prometheus.io/path: "/metrics" + prometheus.io/port: "10254" labels: app.kubernetes.io/name: flyteclusterresourcesync app.kubernetes.io/instance: flyte @@ -1400,6 +1402,8 @@ spec: metadata: annotations: configChecksum: "045a4308f47bb9665d221d5d83667a7c9e05ca761134dc79fa8295dd8b611dd" + prometheus.io/path: "/metrics" + prometheus.io/port: "10254" labels: app.kubernetes.io/name: flytepropeller app.kubernetes.io/instance: flyte @@ -1482,6 +1486,8 @@ spec: app.kubernetes.io/version: v1.12.1-rc0 annotations: configChecksum: "045a4308f47bb9665d221d5d83667a7c9e05ca761134dc79fa8295dd8b611dd" + prometheus.io/path: "/metrics" + prometheus.io/port: "10254" spec: securityContext: fsGroup: 65534 diff --git a/deployment/gcp/flyte_helm_controlplane_generated.yaml b/deployment/gcp/flyte_helm_controlplane_generated.yaml index b227a9c0d0..02c6ef5edf 100644 --- a/deployment/gcp/flyte_helm_controlplane_generated.yaml +++ b/deployment/gcp/flyte_helm_controlplane_generated.yaml @@ -765,6 +765,8 @@ 
spec: metadata: annotations: configChecksum: "dc18f5d54e0770c574e6b0693724047e22063030259104eebb554398d63209f" + prometheus.io/path: "/metrics" + prometheus.io/port: "10254" labels: app.kubernetes.io/name: flyteclusterresourcesync app.kubernetes.io/instance: flyte diff --git a/deployment/gcp/flyte_helm_dataplane_generated.yaml b/deployment/gcp/flyte_helm_dataplane_generated.yaml index 0da528443f..e89bde15bf 100644 --- a/deployment/gcp/flyte_helm_dataplane_generated.yaml +++ b/deployment/gcp/flyte_helm_dataplane_generated.yaml @@ -437,6 +437,8 @@ spec: metadata: annotations: configChecksum: "2a962c9fcb8a58e835ea829883300ae11e9124b9972c5e1fe29e1cc283dd2f9" + prometheus.io/path: "/metrics" + prometheus.io/port: "10254" labels: app.kubernetes.io/name: flytepropeller app.kubernetes.io/instance: flyte @@ -518,6 +520,8 @@ spec: app.kubernetes.io/version: v1.12.1-rc0 annotations: configChecksum: "2a962c9fcb8a58e835ea829883300ae11e9124b9972c5e1fe29e1cc283dd2f9" + prometheus.io/path: "/metrics" + prometheus.io/port: "10254" spec: securityContext: fsGroup: 65534 diff --git a/deployment/gcp/flyte_helm_generated.yaml b/deployment/gcp/flyte_helm_generated.yaml index 6df3f31225..5fe9dcdaaa 100644 --- a/deployment/gcp/flyte_helm_generated.yaml +++ b/deployment/gcp/flyte_helm_generated.yaml @@ -1099,6 +1099,8 @@ spec: metadata: annotations: configChecksum: "dc18f5d54e0770c574e6b0693724047e22063030259104eebb554398d63209f" + prometheus.io/path: "/metrics" + prometheus.io/port: "10254" labels: app.kubernetes.io/name: flyteclusterresourcesync app.kubernetes.io/instance: flyte @@ -1423,6 +1425,8 @@ spec: metadata: annotations: configChecksum: "2a962c9fcb8a58e835ea829883300ae11e9124b9972c5e1fe29e1cc283dd2f9" + prometheus.io/path: "/metrics" + prometheus.io/port: "10254" labels: app.kubernetes.io/name: flytepropeller app.kubernetes.io/instance: flyte @@ -1504,6 +1508,8 @@ spec: app.kubernetes.io/version: v1.12.1-rc0 annotations: configChecksum: 
"2a962c9fcb8a58e835ea829883300ae11e9124b9972c5e1fe29e1cc283dd2f9" + prometheus.io/path: "/metrics" + prometheus.io/port: "10254" spec: securityContext: fsGroup: 65534 diff --git a/deployment/sandbox/flyte_helm_generated.yaml b/deployment/sandbox/flyte_helm_generated.yaml index f10c4cfdb3..c984ddac6c 100644 --- a/deployment/sandbox/flyte_helm_generated.yaml +++ b/deployment/sandbox/flyte_helm_generated.yaml @@ -6870,6 +6870,8 @@ spec: metadata: annotations: configChecksum: "475154c41cdb06999025ab796aa1264fa3d235df51ac088a39c89c7ce300408" + prometheus.io/path: "/metrics" + prometheus.io/port: "10254" labels: app.kubernetes.io/name: flyteclusterresourcesync app.kubernetes.io/instance: flyte @@ -7174,6 +7176,8 @@ spec: metadata: annotations: configChecksum: "f892b909c52752746c1b17c780ae5733f70d8c731acc9a89c31361c5690c8a5" + prometheus.io/path: "/metrics" + prometheus.io/port: "10254" labels: app.kubernetes.io/name: flytepropeller app.kubernetes.io/instance: flyte @@ -7248,6 +7252,8 @@ spec: app.kubernetes.io/version: v1.12.1-rc0 annotations: configChecksum: "f892b909c52752746c1b17c780ae5733f70d8c731acc9a89c31361c5690c8a5" + prometheus.io/path: "/metrics" + prometheus.io/port: "10254" spec: securityContext: fsGroup: 65534 diff --git a/docker/sandbox-bundled/manifests/complete-agent.yaml b/docker/sandbox-bundled/manifests/complete-agent.yaml index 0a18e42b33..f261cb2623 100644 --- a/docker/sandbox-bundled/manifests/complete-agent.yaml +++ b/docker/sandbox-bundled/manifests/complete-agent.yaml @@ -816,7 +816,7 @@ type: Opaque --- apiVersion: v1 data: - haSharedSecret: eHMzc0p4bTZoSVVFb1V0Uw== + haSharedSecret: RXVLTERoMzJHdktxQlo4cQ== proxyPassword: "" proxyUsername: "" kind: Secret @@ -1412,7 +1412,7 @@ spec: metadata: annotations: checksum/config: 8f50e768255a87f078ba8b9879a0c174c3e045ffb46ac8723d2eedbe293c8d81 - checksum/secret: 174761d8b2609550a723b808cf0807c0b29cd4d3e1050ee85178a46fbca1a61c + checksum/secret: 
3154841b4ef47e9abff94afa4fb18f019a8ab1bd3bfd067ee69c58a6cfa72850 labels: app: docker-registry release: flyte-sandbox diff --git a/docker/sandbox-bundled/manifests/complete.yaml b/docker/sandbox-bundled/manifests/complete.yaml index 2bdaa4bb4a..ff5ba90edd 100644 --- a/docker/sandbox-bundled/manifests/complete.yaml +++ b/docker/sandbox-bundled/manifests/complete.yaml @@ -796,7 +796,7 @@ type: Opaque --- apiVersion: v1 data: - haSharedSecret: OFdhR0JObmY4TkFWd1JaMg== + haSharedSecret: dmJQRFJKUkkxNUM4YlJVQg== proxyPassword: "" proxyUsername: "" kind: Secret @@ -1360,7 +1360,7 @@ spec: metadata: annotations: checksum/config: 8f50e768255a87f078ba8b9879a0c174c3e045ffb46ac8723d2eedbe293c8d81 - checksum/secret: 71ade407a1350a0e7ee684b637e0a0d15cf511e83431b5466448e1a79da1d275 + checksum/secret: 08894598b0a7ca264bfe377c29916b437ba3afbfcc12d664e0c28285913fca2c labels: app: docker-registry release: flyte-sandbox diff --git a/docker/sandbox-bundled/manifests/dev.yaml b/docker/sandbox-bundled/manifests/dev.yaml index 0ea904cd28..5692b7e353 100644 --- a/docker/sandbox-bundled/manifests/dev.yaml +++ b/docker/sandbox-bundled/manifests/dev.yaml @@ -499,7 +499,7 @@ metadata: --- apiVersion: v1 data: - haSharedSecret: b1k1Q0xLa2hkR3doaG9NMg== + haSharedSecret: amQ2V3I1QWxxWE5FR3ptbA== proxyPassword: "" proxyUsername: "" kind: Secret @@ -934,7 +934,7 @@ spec: metadata: annotations: checksum/config: 8f50e768255a87f078ba8b9879a0c174c3e045ffb46ac8723d2eedbe293c8d81 - checksum/secret: 7d2c67983dc470228224d912a242b2abcc695e391dc54f755e95dd5b820c7215 + checksum/secret: 21b7819320348d53868271005a964a9644a024a2ea50c0936db78e55d1a923e3 labels: app: docker-registry release: flyte-sandbox diff --git a/flyteadmin/cmd/entrypoints/clusterresource.go b/flyteadmin/cmd/entrypoints/clusterresource.go index bb47d8775f..578d92868e 100644 --- a/flyteadmin/cmd/entrypoints/clusterresource.go +++ b/flyteadmin/cmd/entrypoints/clusterresource.go @@ -10,6 +10,7 @@ import ( 
"github.com/flyteorg/flyte/flyteadmin/pkg/clusterresource" "github.com/flyteorg/flyte/flyteadmin/pkg/runtime" "github.com/flyteorg/flyte/flytestdlib/logger" + "github.com/flyteorg/flyte/flytestdlib/profutils" "github.com/flyteorg/flyte/flytestdlib/promutils" ) @@ -29,6 +30,17 @@ var controllerRunCmd = &cobra.Command{ if err != nil { return err } + + // Serve profiling endpoints. + cfg := runtime.NewConfigurationProvider() + go func() { + err := profutils.StartProfilingServerWithDefaultHandlers( + ctx, cfg.ApplicationConfiguration().GetTopLevelConfig().GetProfilerPort(), nil) + if err != nil { + logger.Panicf(ctx, "Failed to Start profiling and Metrics server. Error, %v", err) + } + }() + clusterResourceController.Run() logger.Infof(ctx, "ClusterResourceController started running successfully") return nil diff --git a/flyteadmin/pkg/clusterresource/controller.go b/flyteadmin/pkg/clusterresource/controller.go index cdf9ea0655..daad2600e8 100644 --- a/flyteadmin/pkg/clusterresource/controller.go +++ b/flyteadmin/pkg/clusterresource/controller.go @@ -69,6 +69,7 @@ type Controller interface { type controllerMetrics struct { Scope promutils.Scope + SyncErrors prometheus.Counter SyncStarted prometheus.Counter KubernetesResourcesCreated prometheus.Counter KubernetesResourcesCreateErrors prometheus.Counter @@ -615,6 +616,7 @@ func (c *controller) Sync(ctx context.Context) error { logger.Infof(ctx, "Completed cluster resource creation loop with stats: [%+v]", stats) if len(errs) > 0 { + c.metrics.SyncErrors.Add(float64(len(errs))) return errors.NewCollectedFlyteAdminError(codes.Internal, errs) } @@ -637,7 +639,8 @@ func (c *controller) Run() { func newMetrics(scope promutils.Scope) controllerMetrics { return controllerMetrics{ - Scope: scope, + Scope: scope, + SyncErrors: scope.MustNewCounter("sync_errors", "overall count of errors that occurred within a 'sync' method"), SyncStarted: scope.MustNewCounter("k8s_resource_syncs", "overall count of the number of invocations of 
the resource controller 'sync' method"), KubernetesResourcesCreated: scope.MustNewCounter("k8s_resources_created",