diff --git a/.github/workflows/publish-app-catalogue.yaml b/.github/workflows/publish-app-catalogue.yaml index 5afdb4523..72ec07a17 100644 --- a/.github/workflows/publish-app-catalogue.yaml +++ b/.github/workflows/publish-app-catalogue.yaml @@ -67,9 +67,9 @@ jobs: # Get the version of the truefoundry helm chart cp_chart_version=$(yq e '.spec.source.targetRevision' ./catalogues/tfy-k8s-aws-eks-inframold/templates/truefoundry.yaml) # Sync to S3 - aws s3 sync ./catalogues/tfy-k8s-aws-eks-inframold s3://tfy-argo-application-catalogue/aws-eks --delete + aws s3 sync ./catalogues/tfy-k8s-aws-eks-inframold/templates s3://tfy-argo-application-catalogue/aws-eks/templates --delete # Sync to S3 chart version folder - aws s3 sync ./catalogues/tfy-k8s-aws-eks-inframold s3://tfy-argo-application-catalogue/aws-eks/$cp_chart_version --delete + aws s3 sync ./catalogues/tfy-k8s-aws-eks-inframold/templates s3://tfy-argo-application-catalogue/aws-eks/$cp_chart_version/templates --delete echo "Synced catalogue for aws-eks successfully" - run: | echo "Render GCP standard k8s manifests" @@ -77,9 +77,9 @@ jobs: # Get the version of the truefoundry helm chart cp_chart_version=$(yq e '.spec.source.targetRevision' ./catalogues/tfy-k8s-gcp-gke-standard-inframold/templates/truefoundry.yaml) # Sync to S3 - aws s3 sync ./catalogues/tfy-k8s-gcp-gke-standard-inframold s3://tfy-argo-application-catalogue/gcp-gke-standard --delete + aws s3 sync ./catalogues/tfy-k8s-gcp-gke-standard-inframold/templates s3://tfy-argo-application-catalogue/gcp-gke-standard/templates --delete # Sync to S3 chart version folder - aws s3 sync ./catalogues/tfy-k8s-gcp-gke-standard-inframold s3://tfy-argo-application-catalogue/gcp-gke-standard/$cp_chart_version --delete + aws s3 sync ./catalogues/tfy-k8s-gcp-gke-standard-inframold/templates s3://tfy-argo-application-catalogue/gcp-gke-standard/$cp_chart_version/templates --delete echo "Synced catalogue for gcp-standard successfully" - run: | echo "Render Azure-AKS manifests" 
@@ -87,9 +87,9 @@ jobs: # Get the version of the truefoundry helm chart cp_chart_version=$(yq e '.spec.source.targetRevision' ./catalogues/tfy-k8s-azure-aks-inframold/templates/truefoundry.yaml) # Sync to S3 - aws s3 sync ./catalogues/tfy-k8s-azure-aks-inframold s3://tfy-argo-application-catalogue/azure-aks --delete + aws s3 sync ./catalogues/tfy-k8s-azure-aks-inframold/templates s3://tfy-argo-application-catalogue/azure-aks/templates --delete # Sync to S3 chart version folder - aws s3 sync ./catalogues/tfy-k8s-azure-aks-inframold s3://tfy-argo-application-catalogue/azure-aks/$cp_chart_version --delete + aws s3 sync ./catalogues/tfy-k8s-azure-aks-inframold/templates s3://tfy-argo-application-catalogue/azure-aks/$cp_chart_version/templates --delete echo "Synced catalogue for azure-aks successfully" - run: | echo "Render Generic k8s manifests" @@ -97,9 +97,9 @@ jobs: # Get the version of the truefoundry helm chart cp_chart_version=$(yq e '.spec.source.targetRevision' ./catalogues/tfy-k8s-generic-inframold/templates/truefoundry.yaml) # Sync to S3 - aws s3 sync ./catalogues/tfy-k8s-generic-inframold s3://tfy-argo-application-catalogue/generic --delete + aws s3 sync ./catalogues/tfy-k8s-generic-inframold/templates s3://tfy-argo-application-catalogue/generic/templates --delete # Sync to S3 chart version folder - aws s3 sync ./catalogues/tfy-k8s-generic-inframold s3://tfy-argo-application-catalogue/generic/$cp_chart_version --delete + aws s3 sync ./catalogues/tfy-k8s-generic-inframold/templates s3://tfy-argo-application-catalogue/generic/$cp_chart_version/templates --delete echo "Synced catalogue for generic k8s successfully" - run: | echo "Render Civo-Talos k8s manifests" @@ -107,9 +107,9 @@ jobs: # Get the version of the truefoundry helm chart cp_chart_version=$(yq e '.spec.source.targetRevision' ./catalogues/tfy-k8s-civo-talos-inframold/templates/truefoundry.yaml) # Sync to S3 - aws s3 sync ./catalogues/tfy-k8s-civo-talos-inframold 
s3://tfy-argo-application-catalogue/civo-talos --delete + aws s3 sync ./catalogues/tfy-k8s-civo-talos-inframold/templates s3://tfy-argo-application-catalogue/civo-talos/templates --delete # Sync to S3 chart version folder - aws s3 sync ./catalogues/tfy-k8s-civo-talos-inframold s3://tfy-argo-application-catalogue/civo-talos/$cp_chart_version --delete + aws s3 sync ./catalogues/tfy-k8s-civo-talos-inframold/templates s3://tfy-argo-application-catalogue/civo-talos/$cp_chart_version/templates --delete echo "Synced catalogue for civo-talos k8s successfully" - run: | # Invalidate cloudfront diff --git a/charts/tfy-agent/Chart.yaml b/charts/tfy-agent/Chart.yaml index 096294750..0beb05a1c 100644 --- a/charts/tfy-agent/Chart.yaml +++ b/charts/tfy-agent/Chart.yaml @@ -15,7 +15,7 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 0.2.39 +version: 0.2.42 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to diff --git a/charts/tfy-agent/README.md b/charts/tfy-agent/README.md index 567189d22..8f09d7ad7 100644 --- a/charts/tfy-agent/README.md +++ b/charts/tfy-agent/README.md @@ -33,6 +33,56 @@ This application has two parts. * If the list of allowed namespaces is empty. We set up [cluster-wide access](https://github.com/truefoundry/infra-charts/blob/main/charts/tfy-agent/templates/tfy-agent-proxy-clusterrolebinding-ns.yaml) for these namespaced resources. +## Troubleshooting + +### Using a self-signed certificate in the control plane URL +If your control plane URL is using a self-signed CA certificate, follow these steps: +1. Update the CA bundle in the container by mounting your CA bundle. This can be done in two ways: + 1. 
using volume mounts + - create a config map from your `ca-certificate.crt` file, stored under the key `ca-certificates.crt` + + `kubectl create configmap tfy-ca-cert -n tfy-agent --from-file=ca-certificates.crt=ca-certificate.crt` + + - add the following volumes and volume mounts in both tfyAgent and tfyAgentProxy + ``` + tfyAgent: + extraVolumes: + - name: ca-certificates-volume + configMap: + name: tfy-ca-cert + items: + - key: ca-certificates.crt + path: ca-certificates.crt + extraVolumeMounts: + - name: ca-certificates-volume + mountPath: /etc/ssl/certs/ca-certificates.crt + subPath: ca-certificates.crt + readOnly: true + tfyAgentProxy: + extraVolumes: + - name: ca-certificates-volume + configMap: + name: tfy-ca-cert + items: + - key: ca-certificates.crt + path: ca-certificates.crt + extraVolumeMounts: + - name: ca-certificates-volume + mountPath: /etc/ssl/certs/ca-certificates.crt + subPath: ca-certificates.crt + readOnly: true + ``` + 2. using jspolicy - [link](https://artifacthub.io/packages/helm/truefoundry/tfy-jspolicy-config) + +2. Add `extraEnvVars` in tfyAgent to allow insecure connections (note: this disables TLS certificate verification in the Node.js process) + ``` + tfyAgent: + extraEnvVars: + - name: NODE_TLS_REJECT_UNAUTHORIZED + value: '0' + ``` + + ## Parameters ### Configuration parameters @@ -77,7 +127,7 @@ This application has two parts. | `tfyAgent.service.type` | Type for tfyAgent Service | `ClusterIP` | | `tfyAgent.image.repository` | tfyAgent repository | `tfy.jfrog.io/tfy-images/tfy-agent` | | `tfyAgent.image.pullPolicy` | Pull policy for tfyAgent | `IfNotPresent` | -| `tfyAgent.image.tag` | Overrides the image tag whose default is the chart appVersion. | `29b288e0b59ba09cdd4bf51ef97c86bfdcf1e626` | +| `tfyAgent.image.tag` | Overrides the image tag whose default is the chart appVersion. | `abdd060d96379a09bed4d6c2ab7516a11e154bfa` | | `tfyAgent.resources.limits.cpu` | CPU resource limits for tfyAgent container. Advised to only increase the limits and not decrease it | `500m` | | `tfyAgent.resources.limits.memory` | Memory Resource limits for tfyAgent container. 
Advised to only increase the limits and not decrease it | `512Mi` | | `tfyAgent.resources.limits.ephemeral-storage` | Ephemeral storage Resource limits for tfyAgent container. Advised to only increase the limits and not decrease it | `256Mi` | @@ -117,7 +167,7 @@ This application has two parts. | `tfyAgentProxy.annotations` | Add annotations to tfyAgentProxy pods | `{}` | | `tfyAgentProxy.image.repository` | tfyAgentProxy repository | `tfy.jfrog.io/tfy-images/tfy-agent-proxy` | | `tfyAgentProxy.image.pullPolicy` | Pull policy for tfyAgentProxy | `IfNotPresent` | -| `tfyAgentProxy.image.tag` | Image tag whose default is the chart appVersion. | `0823e317799add6beaaa4037b81068f6c25f3bf7` | +| `tfyAgentProxy.image.tag` | Image tag whose default is the chart appVersion. | `fcfb8e398eb01f05fb72cd9115b9ec69a89b9cce` | | `tfyAgentProxy.extraEnvVars` | Additional envrionment variables for tfyAgentPRoxy | `[]` | | `tfyAgentProxy.resources.limits.cpu` | CPU resource limits for tfyAgentProxy container. Advised to only increase the limits and not decrease it | `500m` | | `tfyAgentProxy.resources.limits.memory` | Memory Resource limits for tfyAgentProxy container. Advised to only increase the limits and not decrease it | `512Mi` | @@ -133,6 +183,8 @@ This application has two parts. | `tfyAgentProxy.serviceAccount.create` | Bool to enable serviceAccount creation | `true` | | `tfyAgentProxy.serviceAccount.annotations` | Annotations to add to the serviceAccount | `{}` | | `tfyAgentProxy.serviceAccount.name` | Name of the serviceAccount to use. If not set and create is true, a name is generated using the fullname template | `""` | +| `tfyAgentProxy.extraVolumes` | Extra volume for tfyAgentProxy container | `[]` | +| `tfyAgentProxy.extraVolumeMounts` | Extra volume mount for tfyAgentProxy container | `[]` | | `tfyAgentProxy.clusterRole.enable` | Create cluster role. | `true` | | `tfyAgentProxy.clusterRole.strictMode` | Only add required authz rules. 
| `false` | | `tfyAgentProxy.clusterRole.clusterScopedAdditionalClusterRoleRules` | Additional rules to add to the cluster role for cluster-scoped resources. | `[]` | diff --git a/charts/tfy-agent/templates/tfy-agent-proxy-deployment.yaml b/charts/tfy-agent/templates/tfy-agent-proxy-deployment.yaml index db42b4d1e..568bf1eed 100644 --- a/charts/tfy-agent/templates/tfy-agent-proxy-deployment.yaml +++ b/charts/tfy-agent/templates/tfy-agent-proxy-deployment.yaml @@ -51,6 +51,8 @@ spec: imagePullPolicy: {{ .Values.tfyAgentProxy.image.pullPolicy }} resources: {{- toYaml .Values.tfyAgentProxy.resources | nindent 12 }} + volumeMounts: + {{- toYaml .Values.tfyAgentProxy.extraVolumeMounts | nindent 12 }} {{- with .Values.tfyAgentProxy.nodeSelector }} nodeSelector: {{- toYaml . | nindent 8 }} @@ -66,4 +68,6 @@ spec: {{- with .Values.tfyAgentProxy.priorityClassName }} priorityClassName: {{ . | quote }} {{- end }} + volumes: + {{- toYaml .Values.tfyAgentProxy.extraVolumes | nindent 8 }} {{- end }} diff --git a/charts/tfy-agent/values.yaml b/charts/tfy-agent/values.yaml index 1de75514e..8adaef74e 100644 --- a/charts/tfy-agent/values.yaml +++ b/charts/tfy-agent/values.yaml @@ -25,8 +25,7 @@ config: ## @param config.controlPlaneClusterIP ClusterIP of the control plane to connect agent (format: `http://`) ## controlPlaneClusterIP: "http://truefoundry-truefoundry-frontend-app.truefoundry.svc.cluster.local:5000" - - ## @param config.controlPlaneControllerClusterIP ClusterIP of the control plane controller to connect proxy (format: `http://`) + ## @param config.controlPlaneControllerClusterIP ClusterIP of the control plane controller to connect proxy (format: `http://`) ## controlPlaneControllerClusterIP: "http://truefoundry-tfy-controller.truefoundry.svc.cluster.local:8123" @@ -73,8 +72,8 @@ config: # - default # - namespace-1 -## @param imagePullSecrets Secrets to pull images -## + ## @param imagePullSecrets Secrets to pull images + ## imagePullSecrets: [] ## @param nameOverride 
String to override partial name passed in helm install command @@ -141,7 +140,7 @@ tfyAgent: ## @param tfyAgent.service.port Port for tfyAgent service ## port: 3000 - + ## @param tfyAgent.service.nodePort Port to expose on each node. Only used if service.type is 'NodePort' ## nodePort: "" @@ -170,7 +169,7 @@ tfyAgent: ## pullPolicy: IfNotPresent ## @param tfyAgent.image.tag Overrides the image tag whose default is the chart appVersion. - tag: "29b288e0b59ba09cdd4bf51ef97c86bfdcf1e626" + tag: "abdd060d96379a09bed4d6c2ab7516a11e154bfa" ## Define resources requests and limits for single Pods. ## ref: https://kubernetes.io/docs/user-guide/compute-resources/ @@ -322,7 +321,7 @@ tfyAgentProxy: pullPolicy: IfNotPresent ## @param tfyAgentProxy.image.tag Image tag whose default is the chart appVersion. ## - tag: "0823e317799add6beaaa4037b81068f6c25f3bf7" + tag: "fcfb8e398eb01f05fb72cd9115b9ec69a89b9cce" ## @param tfyAgentProxy.extraEnvVars Additional envrionment variables for tfyAgentPRoxy ## @@ -420,6 +419,13 @@ tfyAgentProxy: ## @param tfyAgentProxy.serviceAccount.name Name of the serviceAccount to use. If not set and create is true, a name is generated using the fullname template ## name: "" + + ## @param tfyAgentProxy.extraVolumes Extra volume for tfyAgentProxy container + ## + extraVolumes: [] + ## @param tfyAgentProxy.extraVolumeMounts Extra volume mount for tfyAgentProxy container + ## + extraVolumeMounts: [] clusterRole: ## @param tfyAgentProxy.clusterRole.enable Create cluster role. @@ -440,8 +446,8 @@ tfyAgentProxy: # resources: ["namespaces"] # verbs: ["create"] -## @section resourceQuota Add a ResourceQuota to enable priority class in a namspace. -## + ## @section resourceQuota Add a ResourceQuota to enable priority class in a namspace. + ## resourceQuota: ## @param resourceQuota.enabled Create the ResourceQuota. 
enabled: true diff --git a/charts/tfy-buildkitd-service/Chart.yaml b/charts/tfy-buildkitd-service/Chart.yaml index 5c2a13ad3..b06c0ef39 100644 --- a/charts/tfy-buildkitd-service/Chart.yaml +++ b/charts/tfy-buildkitd-service/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v2 name: tfy-buildkitd-service description: Buildkitd service chart type: application -version: 0.2.1-rc.1 +version: 0.2.1 appVersion: "0.16.0" maintainers: - name: truefoundry diff --git a/charts/tfy-buildkitd-service/README.md b/charts/tfy-buildkitd-service/README.md index 312621539..02c722c50 100644 --- a/charts/tfy-buildkitd-service/README.md +++ b/charts/tfy-buildkitd-service/README.md @@ -3,7 +3,115 @@ Tfy-buildkitd-service is a Helm chart provided by TrueFoundry that facilitates t ## Parameters -### Parameters for tfyBuildkitdService +###### Parameters for tfyBuildkitdService + +| Name | Description | Value | +| -------------------------------------- | ----------------------------------------------------------------------------------------------------------------- | --------------------------- | +| `replicaCount` | Number of replicas of Value kept for future use, kept 1 | `1` | +| `image.repository` | tfyBuildkitdService repository | `moby/buildkit` | +| `image.pullPolicy` | Pull policy for tfyBuildkitdService | `IfNotPresent` | +| `image.tag` | Image tag whose default is the chart appVersion. 
| `v0.16.0` | +| `storage.accessModes` | Access mode for tfyBuildkitdService | `["ReadWriteOnce"]` | +| `storage.storageClassName` | Storage class name for tfyBuildkitdService | `""` | +| `storage.size` | Size of the storage for tfyBuildkitdService | `200Gi` | +| `imagePullSecrets` | Secrets to pull images | `[]` | +| `nameOverride` | String to override partial name passed in helm install command | `""` | +| `fullnameOverride` | String to override full name passed in helm install command | `""` | +| `serviceAccount.create` | Bool to enable serviceAccount creation | `true` | +| `serviceAccount.annotations` | Annotations to add to the serviceAccount | `{}` | +| `serviceAccount.name` | Name of the serviceAccount to use. If not set and create is true, a name is generated using the fullname template | `""` | +| `podAnnotations` | Annotations to be added to the pod | `{}` | +| `podSecurityContext` | Security context for the pod | `{}` | +| `securityContext.privileged` | Security Context for the tfyBuildkitdServiceProxy container | `true` | +| `service.type` | Type for tfyBuildkitdService Service | `ClusterIP` | +| `service.port` | Port for tfyBuildkitdService service | `1234` | +| `resources.limits.cpu` | CPU resource limits for tfyBuildkitdService container. | `2500m` | +| `resources.limits.memory` | Memory Resource limits for tfyBuildkitdService container. | `8Gi` | +| `resources.limits.ephemeral-storage` | Ephemeral Storage limits for tfyBuildkitdService container. | `100Mi` | +| `resources.requests.cpu` | CPU resource requests for tfyBuildkitdService container. | `2500m` | +| `resources.requests.memory` | Memory Resource requests for tfyBuildkitdService container. | `8Gi` | +| `resources.requests.ephemeral-storage` | Ephemeral Storage requests for tfyBuildkitdService container. 
| `100Mi` | +| `extraVolumes` | List of Volumes to attach to tfyBuildkitdService container | `[]` | +| `extraVolumeMounts` | List of Volume Mounts to attach to tfyBuildkitdService container | `[]` | +| `extraEnvs` | List of Environment Variables to attach to tfyBuildkitdService container | `[]` | +| `nodeSelector` | Parameters to select for scheduling of pod on a node | `{}` | +| `tolerations` | Taints that pod can tolerate | `[]` | +| `affinity` | Affinity rules for pod scheduling on a node | `{}` | +| `tls.buildkitDaemonCertsSecretName` | Name of secret containing the buildkit daemon certs | `tfy-buildkit-daemon-certs` | +| `tls.buildkitClientCertsSecretName` | Name of secret containing the buildkit client certs | `tfy-buildkit-client-certs` | +| `tls.enabled` | Enable TLS for buildkitd | `false` | + +| Name | Description | Value | +| -------------------------------------- | ----------------------------------------------------------------------------------------------------------------- | --------------------------- | +| `replicaCount` | Number of replicas of Value kept for future use, kept 1 | `1` | +| `image.repository` | tfyBuildkitdService repository | `moby/buildkit` | +| `image.pullPolicy` | Pull policy for tfyBuildkitdService | `IfNotPresent` | +| `image.tag` | Image tag whose default is the chart appVersion. 
| `v0.16.0` | +| `storage.accessModes` | Access mode for tfyBuildkitdService | `["ReadWriteOnce"]` | +| `storage.storageClassName` | Storage class name for tfyBuildkitdService | `""` | +| `storage.size` | Size of the storage for tfyBuildkitdService | `200Gi` | +| `imagePullSecrets` | Secrets to pull images | `[]` | +| `nameOverride` | String to override partial name passed in helm install command | `""` | +| `fullnameOverride` | String to override full name passed in helm install command | `""` | +| `serviceAccount.create` | Bool to enable serviceAccount creation | `true` | +| `serviceAccount.annotations` | Annotations to add to the serviceAccount | `{}` | +| `serviceAccount.name` | Name of the serviceAccount to use. If not set and create is true, a name is generated using the fullname template | `""` | +| `podAnnotations` | Annotations to be added to the pod | `{}` | +| `podSecurityContext` | Security context for the pod | `{}` | +| `securityContext.privileged` | Security Context for the tfyBuildkitdServiceProxy container | `true` | +| `service.type` | Type for tfyBuildkitdService Service | `ClusterIP` | +| `service.port` | Port for tfyBuildkitdService service | `1234` | +| `resources.limits.cpu` | CPU resource limits for tfyBuildkitdService container. | `2000m` | +| `resources.limits.memory` | Memory Resource limits for tfyBuildkitdService container. | `8Gi` | +| `resources.limits.ephemeral-storage` | Ephemeral Storage limits for tfyBuildkitdService container. | `100Mi` | +| `resources.requests.cpu` | CPU resource requests for tfyBuildkitdService container. | `2000m` | +| `resources.requests.memory` | Memory Resource requests for tfyBuildkitdService container. | `8Gi` | +| `resources.requests.ephemeral-storage` | Ephemeral Storage requests for tfyBuildkitdService container. 
| `100Mi` | +| `extraVolumes` | List of Volumes to attach to tfyBuildkitdService container | `[]` | +| `extraVolumeMounts` | List of Volume Mounts to attach to tfyBuildkitdService container | `[]` | +| `extraEnvs` | List of Environment Variables to attach to tfyBuildkitdService container | `[]` | +| `nodeSelector` | Parameters to select for scheduling of pod on a node | `{}` | +| `tolerations` | Taints that pod can tolerate | `[]` | +| `affinity` | Affinity rules for pod scheduling on a node | `{}` | +| `tls.buildkitDaemonCertsSecretName` | Name of secret containing the buildkit daemon certs | `tfy-buildkit-daemon-certs` | +| `tls.buildkitClientCertsSecretName` | Name of secret containing the buildkit client certs | `tfy-buildkit-client-certs` | +| `tls.enabled` | Enable TLS for buildkitd | `false` | + +| Name | Description | Value | +| -------------------------------------- | ----------------------------------------------------------------------------------------------------------------- | --------------------------- | +| `replicaCount` | Number of replicas of Value kept for future use, kept 1 | `1` | +| `image.repository` | tfyBuildkitdService repository | `moby/buildkit` | +| `image.pullPolicy` | Pull policy for tfyBuildkitdService | `IfNotPresent` | +| `image.tag` | Image tag whose default is the chart appVersion. 
| `v0.16.0` | +| `storage.accessModes` | Access mode for tfyBuildkitdService | `["ReadWriteOnce"]` | +| `storage.storageClassName` | Storage class name for tfyBuildkitdService | `""` | +| `storage.size` | Size of the storage for tfyBuildkitdService | `200Gi` | +| `imagePullSecrets` | Secrets to pull images | `[]` | +| `nameOverride` | String to override partial name passed in helm install command | `""` | +| `fullnameOverride` | String to override full name passed in helm install command | `""` | +| `serviceAccount.create` | Bool to enable serviceAccount creation | `true` | +| `serviceAccount.annotations` | Annotations to add to the serviceAccount | `{}` | +| `serviceAccount.name` | Name of the serviceAccount to use. If not set and create is true, a name is generated using the fullname template | `""` | +| `podAnnotations` | Annotations to be added to the pod | `{}` | +| `podSecurityContext` | Security context for the pod | `{}` | +| `securityContext.privileged` | Security Context for the tfyBuildkitdServiceProxy container | `true` | +| `service.type` | Type for tfyBuildkitdService Service | `ClusterIP` | +| `service.port` | Port for tfyBuildkitdService service | `1234` | +| `resources.limits.cpu` | CPU resource limits for tfyBuildkitdService container. | `3500m` | +| `resources.limits.memory` | Memory Resource limits for tfyBuildkitdService container. | `13.2Gi` | +| `resources.limits.ephemeral-storage` | Ephemeral Storage limits for tfyBuildkitdService container. | `100Mi` | +| `resources.requests.cpu` | CPU resource requests for tfyBuildkitdService container. | `3500m` | +| `resources.requests.memory` | Memory Resource requests for tfyBuildkitdService container. | `13.2Gi` | +| `resources.requests.ephemeral-storage` | Ephemeral Storage requests for tfyBuildkitdService container. 
| `100Mi` | +| `extraVolumes` | List of Volumes to attach to tfyBuildkitdService container | `[]` | +| `extraVolumeMounts` | List of Volume Mounts to attach to tfyBuildkitdService container | `[]` | +| `extraEnvs` | List of Environment Variables to attach to tfyBuildkitdService container | `[]` | +| `nodeSelector` | Parameters to select for scheduling of pod on a node | `{}` | +| `tolerations` | Taints that pod can tolerate | `[]` | +| `affinity` | Affinity rules for pod scheduling on a node | `{}` | +| `tls.buildkitDaemonCertsSecretName` | Name of secret containing the buildkit daemon certs | `tfy-buildkit-daemon-certs` | +| `tls.buildkitClientCertsSecretName` | Name of secret containing the buildkit client certs | `tfy-buildkit-client-certs` | +| `tls.enabled` | Enable TLS for buildkitd | `false` | | Name | Description | Value | | -------------------------------------- | ----------------------------------------------------------------------------------------------------------------- | --------------------------- | diff --git a/charts/tfy-buildkitd-service/templates/statefulset.yaml b/charts/tfy-buildkitd-service/templates/statefulset.yaml index bf96872f1..ca2816168 100644 --- a/charts/tfy-buildkitd-service/templates/statefulset.yaml +++ b/charts/tfy-buildkitd-service/templates/statefulset.yaml @@ -112,7 +112,7 @@ spec: mountPath: /etc/buildkit/client/certs/ {{- end }} {{- with .Values.extraVolumeMounts }} - {{- toYaml .Values.extraVolumeMounts | nindent 12 }} + {{- toYaml . | nindent 12 }} {{- end }} volumes: - name: buildkit-config @@ -127,7 +127,7 @@ spec: secretName: {{ .Values.tls.buildkitClientCertsSecretName }} {{- end }} {{- with .Values.extraVolumes }} - {{- toYaml .Values.extraVolumes | nindent 8 }} + {{- toYaml . 
| nindent 8 }} {{- end }} {{- with .Values.nodeSelector }} nodeSelector: diff --git a/charts/tfy-buildkitd-service/values.yaml b/charts/tfy-buildkitd-service/values.yaml index c8cd7359c..e6a3cc11d 100644 --- a/charts/tfy-buildkitd-service/values.yaml +++ b/charts/tfy-buildkitd-service/values.yaml @@ -28,7 +28,7 @@ buildkitConfig: | enabled = true gc = true gckeepstorage = "107400000000" - max-parallelism = 4 + max-parallelism = 3 [[worker.oci.gcpolicy]] @@ -119,12 +119,12 @@ service: ## resources: limits: - cpu: 3500m - memory: 13.2Gi + cpu: 2500m + memory: 8Gi ephemeral-storage: 100Mi requests: - cpu: 3500m - memory: 13.2Gi + cpu: 2500m + memory: 8Gi ephemeral-storage: 100Mi ## @param extraVolumes List of Volumes to attach to tfyBuildkitdService container diff --git a/charts/tfy-inferentia-operator/README.md b/charts/tfy-inferentia-operator/README.md index 2a350bbf9..f70ceb87b 100644 --- a/charts/tfy-inferentia-operator/README.md +++ b/charts/tfy-inferentia-operator/README.md @@ -12,15 +12,16 @@ Refer to, ### Configuration for the device plugin responsible for node feature discovery -| Name | Description | Value | -| ---------------------------------------- | ---------------------------------------------- | ------------------------------------------------------ | -| `devicePlugin.enabled` | Enable device plugin Daemonset. | `true` | -| `devicePlugin.resources.requests.cpu` | CPU request for device plugin Daemonset. | `100m` | -| `devicePlugin.resources.requests.memory` | Memory request for device plugin Daemonset. | `128Mi` | -| `devicePlugin.image` | Image to use for device plugin Daemonset. | `public.ecr.aws/neuron/neuron-device-plugin:2.16.18.0` | -| `imagePullSecrets` | (global) List of image pull secrets | `[]` | -| `devicePlugin.imagePullSecrets` | List of image pull secrets | `[]` | -| `devicePlugin.affinity` | Affinity settings for device plugin Daemonset. 
| `{}` | +| Name | Description | Value | +| ---------------------------------------- | ---------------------------------------------------- | -------------------------------------------- | +| `devicePlugin.enabled` | Enable device plugin Daemonset. | `true` | +| `devicePlugin.resources.requests.cpu` | CPU request for device plugin Daemonset. | `100m` | +| `devicePlugin.resources.requests.memory` | Memory request for device plugin Daemonset. | `128Mi` | +| `devicePlugin.image.repository` | Image repository to use for device plugin Daemonset. | `public.ecr.aws/neuron/neuron-device-plugin` | +| `devicePlugin.image.tag` | Image tag to use for device plugin Daemonset. | `2.16.18.0` | +| `imagePullSecrets` | (global) List of image pull secrets | `[]` | +| `devicePlugin.imagePullSecrets` | List of image pull secrets | `[]` | +| `devicePlugin.affinity` | Affinity settings for device plugin Daemonset. | `{}` | ### Configuration for the scheduler responsible for scheduling neuron pods diff --git a/charts/tfy-inferentia-operator/templates/neuron-device-plugin.yaml b/charts/tfy-inferentia-operator/templates/neuron-device-plugin.yaml index 203fe327d..5091b4e07 100644 --- a/charts/tfy-inferentia-operator/templates/neuron-device-plugin.yaml +++ b/charts/tfy-inferentia-operator/templates/neuron-device-plugin.yaml @@ -88,7 +88,7 @@ spec: affinity: {{- toYaml (index .Values "devicePlugin" "affinity") | nindent 8 }} containers: - - image: {{ .Values.devicePlugin.image }} + - image: {{ .Values.devicePlugin.image.repository }}:{{ .Values.devicePlugin.image.tag }} imagePullPolicy: Always name: neuron-device-plugin resources: diff --git a/charts/tfy-inferentia-operator/values.yaml b/charts/tfy-inferentia-operator/values.yaml index 47cb6fe07..c15ee852b 100644 --- a/charts/tfy-inferentia-operator/values.yaml +++ b/charts/tfy-inferentia-operator/values.yaml @@ -2,7 +2,8 @@ ## @param devicePlugin.enabled Enable device plugin Daemonset. 
## @param devicePlugin.resources.requests.cpu CPU request for device plugin Daemonset. ## @param devicePlugin.resources.requests.memory Memory request for device plugin Daemonset. -## @param devicePlugin.image Image to use for device plugin Daemonset. +## @param devicePlugin.image.repository Image repository to use for device plugin Daemonset. +## @param devicePlugin.image.tag Image tag to use for device plugin Daemonset. ## @param imagePullSecrets (global) List of image pull secrets ## @param devicePlugin.imagePullSecrets List of image pull secrets ## @param devicePlugin.affinity [object] Affinity settings for device plugin Daemonset. @@ -13,7 +14,9 @@ devicePlugin: requests: cpu: 100m memory: 128Mi - image: public.ecr.aws/neuron/neuron-device-plugin:2.16.18.0 + image: + repository: public.ecr.aws/neuron/neuron-device-plugin + tag: 2.16.18.0 imagePullSecrets: [] tolerations: ## @skip devicePlugin.tolerations[0] diff --git a/charts/tfy-k8s-aws-eks-inframold/Chart.yaml b/charts/tfy-k8s-aws-eks-inframold/Chart.yaml index 78ce82fe0..5850dbe9b 100644 --- a/charts/tfy-k8s-aws-eks-inframold/Chart.yaml +++ b/charts/tfy-k8s-aws-eks-inframold/Chart.yaml @@ -1,6 +1,6 @@ apiVersion: v2 name: tfy-k8s-aws-eks-inframold -version: 0.1.11 +version: 0.1.12 description: "Inframold, the superchart that configure your cluster on aws for truefoundry." 
maintainers: - name: truefoundry diff --git a/charts/tfy-k8s-aws-eks-inframold/artifacts-manifest.json b/charts/tfy-k8s-aws-eks-inframold/artifacts-manifest.json index 193349a9c..0e2b84513 100644 --- a/charts/tfy-k8s-aws-eks-inframold/artifacts-manifest.json +++ b/charts/tfy-k8s-aws-eks-inframold/artifacts-manifest.json @@ -174,10 +174,10 @@ "details": { "chart": "tfy-agent", "repoURL": "https://truefoundry.github.io/infra-charts/", - "targetRevision": "0.2.35", + "targetRevision": "0.2.42", "images": [ - "tfy.jfrog.io/tfy-images/tfy-agent:c610a56137336ab818510cb42680add0ae3db54e", - "tfy.jfrog.io/tfy-images/tfy-agent-proxy:0823e317799add6beaaa4037b81068f6c25f3bf7" + "tfy.jfrog.io/tfy-images/tfy-agent:abdd060d96379a09bed4d6c2ab7516a11e154bfa", + "tfy.jfrog.io/tfy-images/tfy-agent-proxy:fcfb8e398eb01f05fb72cd9115b9ec69a89b9cce" ] } }, @@ -266,14 +266,14 @@ "details": { "chart": "truefoundry", "repoURL": "https://truefoundry.github.io/infra-charts", - "targetRevision": "0.11.1", + "targetRevision": "0.13.2", "images": [ - "tfy.jfrog.io/tfy-private-images/mlfoundry-server:v0.8.1", - "tfy.jfrog.io/tfy-private-images/servicefoundry-server:v0.11.0", - "tfy.jfrog.io/tfy-private-images/sfy-manifest-service:v0.8.0", - "tfy.jfrog.io/tfy-private-images/tfy-controller:v0.6.0", - "tfy.jfrog.io/tfy-private-images/tfy-k8s-controller:v0.8.0", - "tfy.jfrog.io/tfy-private-images/truefoundry-frontend-app:v0.10.0", + "tfy.jfrog.io/tfy-private-images/mlfoundry-server:v0.10.0", + "tfy.jfrog.io/tfy-private-images/servicefoundry-server:v0.13.2", + "tfy.jfrog.io/tfy-private-images/sfy-manifest-service:v0.10.0", + "tfy.jfrog.io/tfy-private-images/tfy-controller:v0.8.0", + "tfy.jfrog.io/tfy-private-images/tfy-k8s-controller:v0.10.0", + "tfy.jfrog.io/tfy-private-images/truefoundry-frontend-app:v0.12.0", "tfy.jfrog.io/tfy-mirror/nats:2.10.21-alpine3.20", "tfy.jfrog.io/tfy-mirror/natsio/nats-server-config-reloader:0.14.3", "tfy.jfrog.io/tfy-mirror/natsio/prometheus-nats-exporter:0.15.0", @@ 
-965,33 +965,15 @@ { "type": "image", "details": { - "registryURL": "tfy.jfrog.io/tfy-images/tfy-agent:c610a56137336ab818510cb42680add0ae3db54e", - "platforms": [ - { - "os": "linux", - "architecture": "amd64" - }, - { - "os": "linux", - "architecture": "arm64" - } - ] + "registryURL": "tfy.jfrog.io/tfy-images/tfy-agent:abdd060d96379a09bed4d6c2ab7516a11e154bfa", + "platforms": [] } }, { "type": "image", "details": { - "registryURL": "tfy.jfrog.io/tfy-images/tfy-agent-proxy:0823e317799add6beaaa4037b81068f6c25f3bf7", - "platforms": [ - { - "os": "linux", - "architecture": "amd64" - }, - { - "os": "linux", - "architecture": "arm64" - } - ] + "registryURL": "tfy.jfrog.io/tfy-images/tfy-agent-proxy:fcfb8e398eb01f05fb72cd9115b9ec69a89b9cce", + "platforms": [] } }, { @@ -1227,88 +1209,43 @@ { "type": "image", "details": { - "registryURL": "tfy.jfrog.io/tfy-private-images/mlfoundry-server:v0.8.1", - "platforms": [ - { - "os": "linux", - "architecture": "amd64" - }, - { - "os": "linux", - "architecture": "arm64" - } - ] + "registryURL": "tfy.jfrog.io/tfy-private-images/mlfoundry-server:v0.10.0", + "platforms": [] } }, { "type": "image", "details": { - "registryURL": "tfy.jfrog.io/tfy-private-images/servicefoundry-server:v0.11.0", + "registryURL": "tfy.jfrog.io/tfy-private-images/servicefoundry-server:v0.13.2", "platforms": [] } }, { "type": "image", "details": { - "registryURL": "tfy.jfrog.io/tfy-private-images/sfy-manifest-service:v0.8.0", - "platforms": [ - { - "os": "linux", - "architecture": "amd64" - }, - { - "os": "linux", - "architecture": "arm64" - } - ] + "registryURL": "tfy.jfrog.io/tfy-private-images/sfy-manifest-service:v0.10.0", + "platforms": [] } }, { "type": "image", "details": { - "registryURL": "tfy.jfrog.io/tfy-private-images/tfy-controller:v0.6.0", - "platforms": [ - { - "os": "linux", - "architecture": "amd64" - }, - { - "os": "linux", - "architecture": "arm64" - } - ] + "registryURL": "tfy.jfrog.io/tfy-private-images/tfy-controller:v0.8.0", + 
"platforms": [] } }, { "type": "image", "details": { - "registryURL": "tfy.jfrog.io/tfy-private-images/tfy-k8s-controller:v0.8.0", - "platforms": [ - { - "os": "linux", - "architecture": "amd64" - }, - { - "os": "linux", - "architecture": "arm64" - } - ] + "registryURL": "tfy.jfrog.io/tfy-private-images/tfy-k8s-controller:v0.10.0", + "platforms": [] } }, { "type": "image", "details": { - "registryURL": "tfy.jfrog.io/tfy-private-images/truefoundry-frontend-app:v0.10.0", - "platforms": [ - { - "os": "linux", - "architecture": "amd64" - }, - { - "os": "linux", - "architecture": "arm64" - } - ] + "registryURL": "tfy.jfrog.io/tfy-private-images/truefoundry-frontend-app:v0.12.0", + "platforms": [] } }, { diff --git a/charts/tfy-k8s-aws-eks-inframold/templates/prometheus.yaml b/charts/tfy-k8s-aws-eks-inframold/templates/prometheus.yaml index b287462a7..ab6f20f66 100644 --- a/charts/tfy-k8s-aws-eks-inframold/templates/prometheus.yaml +++ b/charts/tfy-k8s-aws-eks-inframold/templates/prometheus.yaml @@ -25,6 +25,8 @@ spec: chart: kube-prometheus-stack helm: values: | + defaultRules: + enabled: false coreDns: enabled: false grafana: @@ -423,42 +425,41 @@ spec: labels: daemonset: 'true' record: kubecost_savings_memory_allocation_bytes - - name: GoogleCadvisor + - name: Alerting rules: - for: 2m expr: >- - (sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) + sum(rate(container_cpu_usage_seconds_total{job="kubelet", container!=""}[5m])) by (pod, container, namespace) / - sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) - by (pod, container, namespace) * 100) > 80 - alert: ContainerHighCpuUtilization + sum(kube_pod_container_resource_requests{job="kube-state-metrics", resource="cpu"}) + by (pod, container, namespace) * 100 > 80 + alert: CPUHighUtilizationForRequested labels: severity: warning annotations: summary: >- Container High CPU utilization (instance {{ `{{ $labels.instance }}` }}) description: |- - Container CPU 
utilization is above 80% + Container CPU utilization is above 80% of requested CPU VALUE = {{`{{ $value }}`}} - for: 2m expr: >- - (sum(container_memory_working_set_bytes{container!=""}) BY (pod, - container, namespace) / sum(container_spec_memory_limit_bytes > 0) BY (pod, - container, namespace) * 100) > 80 - alert: ContainerHighMemoryUsage + sum(container_memory_working_set_bytes{job="kubelet", container!=""}) by (pod, container, namespace) / + sum(kube_pod_container_resource_requests{job="kube-state-metrics", resource="memory"}) by (pod, container, namespace) * 100 > 80 + alert: MemoryHighUtilizationForRequested labels: severity: warning annotations: summary: Container High Memory usage (instance {{ `{{ $labels.instance }}` }}) description: |- - Container Memory usage is above 80% + Container Memory usage is above 80% of requested memory VALUE = {{`{{ $value }}`}} - for: 5m expr: >- - sum(increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) - by (container, pod, namespace) / - sum(increase(container_cpu_cfs_periods_total[5m])) by - (container, pod, namespace) > ( 25 / 100 ) + sum(increase(container_cpu_cfs_throttled_periods_total{job="kubelet", container!=""}[5m])) + by (pod, container, namespace) / + sum(increase(container_cpu_cfs_periods_total{job="kubelet"}[5m])) by + (pod, container, namespace) > ( 25 / 100 ) alert: ContainerHighThrottleRate labels: severity: warning @@ -467,42 +468,38 @@ spec: description: |- Container is being throttled VALUE = {{`{{ $value }}`}} - - name: KubestateExporter - rules: - for: 2m expr: >- - (kube_horizontalpodautoscaler_spec_max_replicas - - kube_horizontalpodautoscaler_status_desired_replicas) * on - (horizontalpodautoscaler,namespace, pod) - (kube_horizontalpodautoscaler_status_condition{condition="ScalingLimited", - status="true"} == 1) == 0 - alert: KubernetesHpaScaleInability + sum(rate(container_cpu_usage_seconds_total{job="kubelet", container!=""}[5m])) + by (pod, container, namespace) / + 
sum(kube_pod_container_resource_limits{job="kube-state-metrics", resource="cpu"}) + by (pod, container, namespace) * 100 > 70 + alert: CPUHighUtilizationForLimit labels: - severity: warning + severity: critical annotations: summary: >- - Kubernetes HPA scale inability (instance {{ `{{ $labels.instance }}` }}) - description: >- - HPA {{`{{ $labels.namespace }}`}}/{{`{{ $labels.horizontalpodautoscaler }}`}} is unable to scale + Container High CPU utilization (instance {{ `{{ $labels.instance }}` }}) + description: |- + Container CPU utilization is above 70% of CPU limit VALUE = {{`{{ $value }}`}} - - for: 0m + - for: 2m expr: >- - kube_horizontalpodautoscaler_status_condition{status="false", - condition="ScalingActive"} == 1 - alert: KubernetesHpaMetricsUnavailability + sum(container_memory_working_set_bytes{job="kubelet", container!=""}) by (pod, container, namespace) / + sum(kube_pod_container_resource_limits{job="kube-state-metrics", resource="memory"}) by (pod, container, namespace) * 100 > 70 + alert: MemoryHighUtilizationForLimit labels: - severity: warning + severity: critical annotations: summary: >- - Kubernetes HPA metrics unavailability (instance {{ `{{ $labels.instance }}` }}) - description: >- - HPA {{`{{ $labels.namespace }}`}}/{{`{{ $labels.horizontalpodautoscaler }}`}} - is unable to collect metrics + Container High Memory usage (instance {{ `{{ $labels.instance }}` }}) + description: |- + Container Memory usage is above 70% of memory limit VALUE = {{`{{ $value }}`}} - for: 15m expr: >- sum by (namespace, pod) - (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"}) > 0 + (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown|Failed"}) > 0 alert: KubernetesPodNotHealthy labels: severity: critical @@ -514,7 +511,7 @@ spec: non-running state for longer than 15 minutes. 
VALUE = {{`{{ $value }}`}} - for: 2m - expr: increase(kube_pod_container_status_restarts_total[1m]) > 3 + expr: increase(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[1m]) > 3 alert: KubernetesPodCrashLooping labels: severity: warning @@ -526,34 +523,86 @@ spec: VALUE = {{`{{ $value }}`}} - for: 2m expr: >- - (kube_horizontalpodautoscaler_status_desired_replicas >= - kube_horizontalpodautoscaler_spec_max_replicas) and - (kube_horizontalpodautoscaler_spec_max_replicas > 1) and - (kube_horizontalpodautoscaler_spec_min_replicas != - kube_horizontalpodautoscaler_spec_max_replicas) - alert: KubernetesHpaScaleMaximum + sum(increase(container_oom_events_total{job="kubelet", container!=""}[5m])) + by (container, pod, namespace) > 0 + alert: ContainerOOMKilled labels: - severity: info + severity: critical annotations: - summary: Kubernetes HPA scale maximum (instance {{ `{{ $labels.instance }}` }}) - description: >- - HPA {{`{{ $labels.namespace }}`}}/{{`{{ $labels.horizontalpodautoscaler }}`}} - has hit maximum number of desired pods - VALUE = {{`{{ $value }}`}} - - for: 0m + summary: >- + Container OOM Killed ({{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}}/{{`{{ $labels.container }}`}}) + description: |- + Container {{`{{ $labels.container }}`}} in pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} + has been OOM killed in the last 5 minutes + VALUE = {{`{{ $value }}`}} times + - for: 10m expr: >- - max(quantile_over_time(0.5, - kube_horizontalpodautoscaler_status_desired_replicas[1d]) == - kube_horizontalpodautoscaler_spec_min_replicas) by - (horizontalpodautoscaler) > 3 - alert: KubernetesHpaUnderutilized + (sum(kubelet_volume_stats_used_bytes{job="kubelet"}) by (persistentvolumeclaim, namespace) / + sum(kubelet_volume_stats_capacity_bytes{job="kubelet"}) by (persistentvolumeclaim, namespace)) * 100 > 90 + alert: PersistentVolumeUsageHigh labels: - severity: info + severity: critical annotations: - summary: Kubernetes HPA underutilized 
(instance {{ `{{ $labels.instance }}` }}) - description: >- - HPA {{`{{ $labels.namespace }}`}}/{{`{{ $labels.horizontalpodautoscaler }}`}} - is constantly at minimum replicas for 50% of the time. - Potential cost saving here. + summary: >- + Persistent Volume usage is above 90% ({{`{{ $labels.namespace }}`}}/{{`{{ $labels.persistentvolumeclaim }}`}}) + description: |- + Persistent Volume {{`{{ $labels.persistentvolumeclaim }}`}} in namespace {{`{{ $labels.namespace }}`}} + is using more than 90% of its capacity. + VALUE = {{`{{ $value }}`}}% + - for: 5m + expr: (1 - (node_filesystem_avail_bytes{job="node-exporter"} / node_filesystem_size_bytes{job="node-exporter"})) > 0.9 + alert: NodeDiskPressure + labels: + severity: critical + annotations: + summary: >- + Node disk usage is above 90% ({{`{{ $labels.instance }}`}}) + description: |- + Node {{`{{ $labels.instance }}`}} is experiencing disk pressure. + VALUE = {{`{{ $value }}`}}% + - for: 5m + expr: node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} < 0.1 + alert: NodeMemoryPressure + labels: + severity: critical + annotations: + summary: >- + Node memory available is below 10% ({{`{{ $labels.instance }}`}}) + description: |- + Node {{`{{ $labels.instance }}`}} is experiencing memory pressure. + VALUE = {{`{{ $value }}`}}% + - for: 5m + expr: rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[5m]) < 0.1 + alert: NodeCPUPressure + labels: + severity: critical + annotations: + summary: >- + Node CPU usage is above 90% ({{`{{ $labels.instance }}`}}) + description: |- + Node {{`{{ $labels.instance }}`}} is experiencing CPU pressure. 
+ VALUE = {{`{{ $value }}`}}% + - for: 5m + expr: kube_node_status_condition{job="kube-state-metrics", condition="Ready", status="true"} == 0 + alert: NodeNotReady + labels: + severity: critical + annotations: + summary: >- + Node is not ready ({{`{{ $labels.node }}`}}) + description: |- + Node {{`{{ $labels.node }}`}} is not in a ready state. + VALUE = {{`{{ $value }}`}} + - for: 5m + expr: kube_node_status_condition{job="kube-state-metrics", condition="NetworkUnavailable", status="true"} == 1 + alert: NodeNetworkUnavailable + labels: + severity: critical + annotations: + summary: >- + Node network is unavailable ({{`{{ $labels.node }}`}}) + description: |- + Node {{`{{ $labels.node }}`}} network is unavailable. VALUE = {{`{{ $value }}`}} + {{- end }} diff --git a/charts/tfy-k8s-aws-eks-inframold/templates/tfy-agent.yaml b/charts/tfy-k8s-aws-eks-inframold/templates/tfy-agent.yaml index 45c2a8262..61d6f8ea2 100644 --- a/charts/tfy-k8s-aws-eks-inframold/templates/tfy-agent.yaml +++ b/charts/tfy-k8s-aws-eks-inframold/templates/tfy-agent.yaml @@ -14,7 +14,7 @@ spec: server: https://kubernetes.default.svc project: tfy-apps source: - targetRevision: 0.2.35 + targetRevision: 0.2.42 repoURL: https://truefoundry.github.io/infra-charts/ chart: tfy-agent helm: diff --git a/charts/tfy-k8s-aws-eks-inframold/templates/truefoundry.yaml b/charts/tfy-k8s-aws-eks-inframold/templates/truefoundry.yaml index da91b6a75..03f06ccd8 100644 --- a/charts/tfy-k8s-aws-eks-inframold/templates/truefoundry.yaml +++ b/charts/tfy-k8s-aws-eks-inframold/templates/truefoundry.yaml @@ -14,7 +14,7 @@ spec: server: https://kubernetes.default.svc project: tfy-apps source: - targetRevision: 0.11.1 + targetRevision: 0.13.2 repoURL: "https://truefoundry.github.io/infra-charts" chart: truefoundry helm: diff --git a/charts/tfy-k8s-azure-aks-inframold/Chart.yaml b/charts/tfy-k8s-azure-aks-inframold/Chart.yaml index ac98807e3..730f7bfc9 100644 --- a/charts/tfy-k8s-azure-aks-inframold/Chart.yaml +++ 
b/charts/tfy-k8s-azure-aks-inframold/Chart.yaml @@ -1,6 +1,6 @@ apiVersion: v2 name: tfy-k8s-azure-aks-inframold -version: 0.1.11 +version: 0.1.12 description: "Inframold, the superchart that configure your cluster on azure for truefoundry." maintainers: - name: truefoundry diff --git a/charts/tfy-k8s-azure-aks-inframold/artifacts-manifest.json b/charts/tfy-k8s-azure-aks-inframold/artifacts-manifest.json index f0fee2dec..39005cd2d 100644 --- a/charts/tfy-k8s-azure-aks-inframold/artifacts-manifest.json +++ b/charts/tfy-k8s-azure-aks-inframold/artifacts-manifest.json @@ -163,10 +163,10 @@ "details": { "chart": "tfy-agent", "repoURL": "https://truefoundry.github.io/infra-charts/", - "targetRevision": "0.2.35", + "targetRevision": "0.2.42", "images": [ - "tfy.jfrog.io/tfy-images/tfy-agent:c610a56137336ab818510cb42680add0ae3db54e", - "tfy.jfrog.io/tfy-images/tfy-agent-proxy:0823e317799add6beaaa4037b81068f6c25f3bf7" + "tfy.jfrog.io/tfy-images/tfy-agent:abdd060d96379a09bed4d6c2ab7516a11e154bfa", + "tfy.jfrog.io/tfy-images/tfy-agent-proxy:fcfb8e398eb01f05fb72cd9115b9ec69a89b9cce" ] } }, @@ -196,14 +196,14 @@ "details": { "chart": "truefoundry", "repoURL": "https://truefoundry.github.io/infra-charts", - "targetRevision": "0.11.1", + "targetRevision": "0.13.2", "images": [ - "tfy.jfrog.io/tfy-private-images/mlfoundry-server:v0.8.1", - "tfy.jfrog.io/tfy-private-images/servicefoundry-server:v0.11.0", - "tfy.jfrog.io/tfy-private-images/sfy-manifest-service:v0.8.0", - "tfy.jfrog.io/tfy-private-images/tfy-controller:v0.6.0", - "tfy.jfrog.io/tfy-private-images/tfy-k8s-controller:v0.8.0", - "tfy.jfrog.io/tfy-private-images/truefoundry-frontend-app:v0.10.0", + "tfy.jfrog.io/tfy-private-images/mlfoundry-server:v0.10.0", + "tfy.jfrog.io/tfy-private-images/servicefoundry-server:v0.13.2", + "tfy.jfrog.io/tfy-private-images/sfy-manifest-service:v0.10.0", + "tfy.jfrog.io/tfy-private-images/tfy-controller:v0.8.0", + "tfy.jfrog.io/tfy-private-images/tfy-k8s-controller:v0.10.0", + 
"tfy.jfrog.io/tfy-private-images/truefoundry-frontend-app:v0.12.0", "tfy.jfrog.io/tfy-mirror/nats:2.10.21-alpine3.20", "tfy.jfrog.io/tfy-mirror/natsio/nats-server-config-reloader:0.14.3", "tfy.jfrog.io/tfy-mirror/natsio/prometheus-nats-exporter:0.15.0", @@ -850,33 +850,15 @@ { "type": "image", "details": { - "registryURL": "tfy.jfrog.io/tfy-images/tfy-agent:c610a56137336ab818510cb42680add0ae3db54e", - "platforms": [ - { - "os": "linux", - "architecture": "amd64" - }, - { - "os": "linux", - "architecture": "arm64" - } - ] + "registryURL": "tfy.jfrog.io/tfy-images/tfy-agent:abdd060d96379a09bed4d6c2ab7516a11e154bfa", + "platforms": [] } }, { "type": "image", "details": { - "registryURL": "tfy.jfrog.io/tfy-images/tfy-agent-proxy:0823e317799add6beaaa4037b81068f6c25f3bf7", - "platforms": [ - { - "os": "linux", - "architecture": "amd64" - }, - { - "os": "linux", - "architecture": "arm64" - } - ] + "registryURL": "tfy.jfrog.io/tfy-images/tfy-agent-proxy:fcfb8e398eb01f05fb72cd9115b9ec69a89b9cce", + "platforms": [] } }, { @@ -914,88 +896,43 @@ { "type": "image", "details": { - "registryURL": "tfy.jfrog.io/tfy-private-images/mlfoundry-server:v0.8.1", - "platforms": [ - { - "os": "linux", - "architecture": "amd64" - }, - { - "os": "linux", - "architecture": "arm64" - } - ] + "registryURL": "tfy.jfrog.io/tfy-private-images/mlfoundry-server:v0.10.0", + "platforms": [] } }, { "type": "image", "details": { - "registryURL": "tfy.jfrog.io/tfy-private-images/servicefoundry-server:v0.11.0", + "registryURL": "tfy.jfrog.io/tfy-private-images/servicefoundry-server:v0.13.2", "platforms": [] } }, { "type": "image", "details": { - "registryURL": "tfy.jfrog.io/tfy-private-images/sfy-manifest-service:v0.8.0", - "platforms": [ - { - "os": "linux", - "architecture": "amd64" - }, - { - "os": "linux", - "architecture": "arm64" - } - ] + "registryURL": "tfy.jfrog.io/tfy-private-images/sfy-manifest-service:v0.10.0", + "platforms": [] } }, { "type": "image", "details": { - "registryURL": 
"tfy.jfrog.io/tfy-private-images/tfy-controller:v0.6.0", - "platforms": [ - { - "os": "linux", - "architecture": "amd64" - }, - { - "os": "linux", - "architecture": "arm64" - } - ] + "registryURL": "tfy.jfrog.io/tfy-private-images/tfy-controller:v0.8.0", + "platforms": [] } }, { "type": "image", "details": { - "registryURL": "tfy.jfrog.io/tfy-private-images/tfy-k8s-controller:v0.8.0", - "platforms": [ - { - "os": "linux", - "architecture": "amd64" - }, - { - "os": "linux", - "architecture": "arm64" - } - ] + "registryURL": "tfy.jfrog.io/tfy-private-images/tfy-k8s-controller:v0.10.0", + "platforms": [] } }, { "type": "image", "details": { - "registryURL": "tfy.jfrog.io/tfy-private-images/truefoundry-frontend-app:v0.10.0", - "platforms": [ - { - "os": "linux", - "architecture": "amd64" - }, - { - "os": "linux", - "architecture": "arm64" - } - ] + "registryURL": "tfy.jfrog.io/tfy-private-images/truefoundry-frontend-app:v0.12.0", + "platforms": [] } }, { diff --git a/charts/tfy-k8s-azure-aks-inframold/templates/prometheus.yaml b/charts/tfy-k8s-azure-aks-inframold/templates/prometheus.yaml index b287462a7..ab6f20f66 100644 --- a/charts/tfy-k8s-azure-aks-inframold/templates/prometheus.yaml +++ b/charts/tfy-k8s-azure-aks-inframold/templates/prometheus.yaml @@ -25,6 +25,8 @@ spec: chart: kube-prometheus-stack helm: values: | + defaultRules: + enabled: false coreDns: enabled: false grafana: @@ -423,42 +425,41 @@ spec: labels: daemonset: 'true' record: kubecost_savings_memory_allocation_bytes - - name: GoogleCadvisor + - name: Alerting rules: - for: 2m expr: >- - (sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) + sum(rate(container_cpu_usage_seconds_total{job="kubelet", container!=""}[5m])) by (pod, container, namespace) / - sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) - by (pod, container, namespace) * 100) > 80 - alert: ContainerHighCpuUtilization + sum(kube_pod_container_resource_requests{job="kube-state-metrics", 
resource="cpu"}) + by (pod, container, namespace) * 100 > 80 + alert: CPUHighUtilizationForRequested labels: severity: warning annotations: summary: >- Container High CPU utilization (instance {{ `{{ $labels.instance }}` }}) description: |- - Container CPU utilization is above 80% + Container CPU utilization is above 80% of requested CPU VALUE = {{`{{ $value }}`}} - for: 2m expr: >- - (sum(container_memory_working_set_bytes{container!=""}) BY (pod, - container, namespace) / sum(container_spec_memory_limit_bytes > 0) BY (pod, - container, namespace) * 100) > 80 - alert: ContainerHighMemoryUsage + sum(container_memory_working_set_bytes{job="kubelet", container!=""}) by (pod, container, namespace) / + sum(kube_pod_container_resource_requests{job="kube-state-metrics", resource="memory"}) by (pod, container, namespace) * 100 > 80 + alert: MemoryHighUtilizationForRequested labels: severity: warning annotations: summary: Container High Memory usage (instance {{ `{{ $labels.instance }}` }}) description: |- - Container Memory usage is above 80% + Container Memory usage is above 80% of requested memory VALUE = {{`{{ $value }}`}} - for: 5m expr: >- - sum(increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) - by (container, pod, namespace) / - sum(increase(container_cpu_cfs_periods_total[5m])) by - (container, pod, namespace) > ( 25 / 100 ) + sum(increase(container_cpu_cfs_throttled_periods_total{job="kubelet", container!=""}[5m])) + by (pod, container, namespace) / + sum(increase(container_cpu_cfs_periods_total{job="kubelet"}[5m])) by + (pod, container, namespace) > ( 25 / 100 ) alert: ContainerHighThrottleRate labels: severity: warning @@ -467,42 +468,38 @@ spec: description: |- Container is being throttled VALUE = {{`{{ $value }}`}} - - name: KubestateExporter - rules: - for: 2m expr: >- - (kube_horizontalpodautoscaler_spec_max_replicas - - kube_horizontalpodautoscaler_status_desired_replicas) * on - (horizontalpodautoscaler,namespace, pod) - 
(kube_horizontalpodautoscaler_status_condition{condition="ScalingLimited", - status="true"} == 1) == 0 - alert: KubernetesHpaScaleInability + sum(rate(container_cpu_usage_seconds_total{job="kubelet", container!=""}[5m])) + by (pod, container, namespace) / + sum(kube_pod_container_resource_limits{job="kube-state-metrics", resource="cpu"}) + by (pod, container, namespace) * 100 > 70 + alert: CPUHighUtilizationForLimit labels: - severity: warning + severity: critical annotations: summary: >- - Kubernetes HPA scale inability (instance {{ `{{ $labels.instance }}` }}) - description: >- - HPA {{`{{ $labels.namespace }}`}}/{{`{{ $labels.horizontalpodautoscaler }}`}} is unable to scale + Container High CPU utilization (instance {{ `{{ $labels.instance }}` }}) + description: |- + Container CPU utilization is above 70% of CPU limit VALUE = {{`{{ $value }}`}} - - for: 0m + - for: 2m expr: >- - kube_horizontalpodautoscaler_status_condition{status="false", - condition="ScalingActive"} == 1 - alert: KubernetesHpaMetricsUnavailability + sum(container_memory_working_set_bytes{job="kubelet", container!=""}) by (pod, container, namespace) / + sum(kube_pod_container_resource_limits{job="kube-state-metrics", resource="memory"}) by (pod, container, namespace) * 100 > 70 + alert: MemoryHighUtilizationForLimit labels: - severity: warning + severity: critical annotations: summary: >- - Kubernetes HPA metrics unavailability (instance {{ `{{ $labels.instance }}` }}) - description: >- - HPA {{`{{ $labels.namespace }}`}}/{{`{{ $labels.horizontalpodautoscaler }}`}} - is unable to collect metrics + Container High Memory usage (instance {{ `{{ $labels.instance }}` }}) + description: |- + Container Memory usage is above 70% of memory limit VALUE = {{`{{ $value }}`}} - for: 15m expr: >- sum by (namespace, pod) - (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"}) > 0 + (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown|Failed"}) > 0 alert: KubernetesPodNotHealthy labels: 
severity: critical @@ -514,7 +511,7 @@ spec: non-running state for longer than 15 minutes. VALUE = {{`{{ $value }}`}} - for: 2m - expr: increase(kube_pod_container_status_restarts_total[1m]) > 3 + expr: increase(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[1m]) > 3 alert: KubernetesPodCrashLooping labels: severity: warning @@ -526,34 +523,86 @@ spec: VALUE = {{`{{ $value }}`}} - for: 2m expr: >- - (kube_horizontalpodautoscaler_status_desired_replicas >= - kube_horizontalpodautoscaler_spec_max_replicas) and - (kube_horizontalpodautoscaler_spec_max_replicas > 1) and - (kube_horizontalpodautoscaler_spec_min_replicas != - kube_horizontalpodautoscaler_spec_max_replicas) - alert: KubernetesHpaScaleMaximum + sum(increase(container_oom_events_total{job="kubelet", container!=""}[5m])) + by (container, pod, namespace) > 0 + alert: ContainerOOMKilled labels: - severity: info + severity: critical annotations: - summary: Kubernetes HPA scale maximum (instance {{ `{{ $labels.instance }}` }}) - description: >- - HPA {{`{{ $labels.namespace }}`}}/{{`{{ $labels.horizontalpodautoscaler }}`}} - has hit maximum number of desired pods - VALUE = {{`{{ $value }}`}} - - for: 0m + summary: >- + Container OOM Killed ({{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}}/{{`{{ $labels.container }}`}}) + description: |- + Container {{`{{ $labels.container }}`}} in pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} + has been OOM killed in the last 5 minutes + VALUE = {{`{{ $value }}`}} times + - for: 10m expr: >- - max(quantile_over_time(0.5, - kube_horizontalpodautoscaler_status_desired_replicas[1d]) == - kube_horizontalpodautoscaler_spec_min_replicas) by - (horizontalpodautoscaler) > 3 - alert: KubernetesHpaUnderutilized + (sum(kubelet_volume_stats_used_bytes{job="kubelet"}) by (persistentvolumeclaim, namespace) / + sum(kubelet_volume_stats_capacity_bytes{job="kubelet"}) by (persistentvolumeclaim, namespace)) * 100 > 90 + alert: PersistentVolumeUsageHigh labels: 
- severity: info + severity: critical annotations: - summary: Kubernetes HPA underutilized (instance {{ `{{ $labels.instance }}` }}) - description: >- - HPA {{`{{ $labels.namespace }}`}}/{{`{{ $labels.horizontalpodautoscaler }}`}} - is constantly at minimum replicas for 50% of the time. - Potential cost saving here. + summary: >- + Persistent Volume usage is above 90% ({{`{{ $labels.namespace }}`}}/{{`{{ $labels.persistentvolumeclaim }}`}}) + description: |- + Persistent Volume {{`{{ $labels.persistentvolumeclaim }}`}} in namespace {{`{{ $labels.namespace }}`}} + is using more than 90% of its capacity. + VALUE = {{`{{ $value }}`}}% + - for: 5m + expr: (1 - (node_filesystem_avail_bytes{job="node-exporter"} / node_filesystem_size_bytes{job="node-exporter"})) > 0.9 + alert: NodeDiskPressure + labels: + severity: critical + annotations: + summary: >- + Node disk usage is above 90% ({{`{{ $labels.instance }}`}}) + description: |- + Node {{`{{ $labels.instance }}`}} is experiencing disk pressure. + VALUE = {{`{{ $value }}`}}% + - for: 5m + expr: node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} < 0.1 + alert: NodeMemoryPressure + labels: + severity: critical + annotations: + summary: >- + Node memory available is below 10% ({{`{{ $labels.instance }}`}}) + description: |- + Node {{`{{ $labels.instance }}`}} is experiencing memory pressure. + VALUE = {{`{{ $value }}`}}% + - for: 5m + expr: rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[5m]) < 0.1 + alert: NodeCPUPressure + labels: + severity: critical + annotations: + summary: >- + Node CPU usage is above 90% ({{`{{ $labels.instance }}`}}) + description: |- + Node {{`{{ $labels.instance }}`}} is experiencing CPU pressure. 
+ VALUE = {{`{{ $value }}`}}% + - for: 5m + expr: kube_node_status_condition{job="kube-state-metrics", condition="Ready", status="true"} == 0 + alert: NodeNotReady + labels: + severity: critical + annotations: + summary: >- + Node is not ready ({{`{{ $labels.node }}`}}) + description: |- + Node {{`{{ $labels.node }}`}} is not in a ready state. + VALUE = {{`{{ $value }}`}} + - for: 5m + expr: kube_node_status_condition{job="kube-state-metrics", condition="NetworkUnavailable", status="true"} == 1 + alert: NodeNetworkUnavailable + labels: + severity: critical + annotations: + summary: >- + Node network is unavailable ({{`{{ $labels.node }}`}}) + description: |- + Node {{`{{ $labels.node }}`}} network is unavailable. VALUE = {{`{{ $value }}`}} + {{- end }} diff --git a/charts/tfy-k8s-azure-aks-inframold/templates/tfy-agent.yaml b/charts/tfy-k8s-azure-aks-inframold/templates/tfy-agent.yaml index 45c2a8262..61d6f8ea2 100644 --- a/charts/tfy-k8s-azure-aks-inframold/templates/tfy-agent.yaml +++ b/charts/tfy-k8s-azure-aks-inframold/templates/tfy-agent.yaml @@ -14,7 +14,7 @@ spec: server: https://kubernetes.default.svc project: tfy-apps source: - targetRevision: 0.2.35 + targetRevision: 0.2.42 repoURL: https://truefoundry.github.io/infra-charts/ chart: tfy-agent helm: diff --git a/charts/tfy-k8s-azure-aks-inframold/templates/truefoundry.yaml b/charts/tfy-k8s-azure-aks-inframold/templates/truefoundry.yaml index da91b6a75..03f06ccd8 100644 --- a/charts/tfy-k8s-azure-aks-inframold/templates/truefoundry.yaml +++ b/charts/tfy-k8s-azure-aks-inframold/templates/truefoundry.yaml @@ -14,7 +14,7 @@ spec: server: https://kubernetes.default.svc project: tfy-apps source: - targetRevision: 0.11.1 + targetRevision: 0.13.2 repoURL: "https://truefoundry.github.io/infra-charts" chart: truefoundry helm: diff --git a/charts/tfy-k8s-civo-talos-inframold/Chart.yaml b/charts/tfy-k8s-civo-talos-inframold/Chart.yaml index 6adcd3a3c..f1a3fe435 100644 --- 
a/charts/tfy-k8s-civo-talos-inframold/Chart.yaml +++ b/charts/tfy-k8s-civo-talos-inframold/Chart.yaml @@ -1,6 +1,6 @@ apiVersion: v2 name: tfy-k8s-civo-talos-inframold -version: 0.1.11 +version: 0.1.12 description: "Inframold, the superchart that configure your cluster on civo for truefoundry." maintainers: - name: truefoundry diff --git a/charts/tfy-k8s-civo-talos-inframold/templates/prometheus.yaml b/charts/tfy-k8s-civo-talos-inframold/templates/prometheus.yaml index b287462a7..ab6f20f66 100644 --- a/charts/tfy-k8s-civo-talos-inframold/templates/prometheus.yaml +++ b/charts/tfy-k8s-civo-talos-inframold/templates/prometheus.yaml @@ -25,6 +25,8 @@ spec: chart: kube-prometheus-stack helm: values: | + defaultRules: + enabled: false coreDns: enabled: false grafana: @@ -423,42 +425,41 @@ spec: labels: daemonset: 'true' record: kubecost_savings_memory_allocation_bytes - - name: GoogleCadvisor + - name: Alerting rules: - for: 2m expr: >- - (sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) + sum(rate(container_cpu_usage_seconds_total{job="kubelet", container!=""}[5m])) by (pod, container, namespace) / - sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) - by (pod, container, namespace) * 100) > 80 - alert: ContainerHighCpuUtilization + sum(kube_pod_container_resource_requests{job="kube-state-metrics", resource="cpu"}) + by (pod, container, namespace) * 100 > 80 + alert: CPUHighUtilizationForRequested labels: severity: warning annotations: summary: >- Container High CPU utilization (instance {{ `{{ $labels.instance }}` }}) description: |- - Container CPU utilization is above 80% + Container CPU utilization is above 80% of requested CPU VALUE = {{`{{ $value }}`}} - for: 2m expr: >- - (sum(container_memory_working_set_bytes{container!=""}) BY (pod, - container, namespace) / sum(container_spec_memory_limit_bytes > 0) BY (pod, - container, namespace) * 100) > 80 - alert: ContainerHighMemoryUsage + 
sum(container_memory_working_set_bytes{job="kubelet", container!=""}) by (pod, container, namespace) / + sum(kube_pod_container_resource_requests{job="kube-state-metrics", resource="memory"}) by (pod, container, namespace) * 100 > 80 + alert: MemoryHighUtilizationForRequested labels: severity: warning annotations: summary: Container High Memory usage (instance {{ `{{ $labels.instance }}` }}) description: |- - Container Memory usage is above 80% + Container Memory usage is above 80% of requested memory VALUE = {{`{{ $value }}`}} - for: 5m expr: >- - sum(increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) - by (container, pod, namespace) / - sum(increase(container_cpu_cfs_periods_total[5m])) by - (container, pod, namespace) > ( 25 / 100 ) + sum(increase(container_cpu_cfs_throttled_periods_total{job="kubelet", container!=""}[5m])) + by (pod, container, namespace) / + sum(increase(container_cpu_cfs_periods_total{job="kubelet"}[5m])) by + (pod, container, namespace) > ( 25 / 100 ) alert: ContainerHighThrottleRate labels: severity: warning @@ -467,42 +468,38 @@ spec: description: |- Container is being throttled VALUE = {{`{{ $value }}`}} - - name: KubestateExporter - rules: - for: 2m expr: >- - (kube_horizontalpodautoscaler_spec_max_replicas - - kube_horizontalpodautoscaler_status_desired_replicas) * on - (horizontalpodautoscaler,namespace, pod) - (kube_horizontalpodautoscaler_status_condition{condition="ScalingLimited", - status="true"} == 1) == 0 - alert: KubernetesHpaScaleInability + sum(rate(container_cpu_usage_seconds_total{job="kubelet", container!=""}[5m])) + by (pod, container, namespace) / + sum(kube_pod_container_resource_limits{job="kube-state-metrics", resource="cpu"}) + by (pod, container, namespace) * 100 > 70 + alert: CPUHighUtilizationForLimit labels: - severity: warning + severity: critical annotations: summary: >- - Kubernetes HPA scale inability (instance {{ `{{ $labels.instance }}` }}) - description: >- - HPA {{`{{ 
$labels.namespace }}`}}/{{`{{ $labels.horizontalpodautoscaler }}`}} is unable to scale + Container High CPU utilization (instance {{ `{{ $labels.instance }}` }}) + description: |- + Container CPU utilization is above 70% of CPU limit VALUE = {{`{{ $value }}`}} - - for: 0m + - for: 2m expr: >- - kube_horizontalpodautoscaler_status_condition{status="false", - condition="ScalingActive"} == 1 - alert: KubernetesHpaMetricsUnavailability + sum(container_memory_working_set_bytes{job="kubelet", container!=""}) by (pod, container, namespace) / + sum(kube_pod_container_resource_limits{job="kube-state-metrics", resource="memory"}) by (pod, container, namespace) * 100 > 70 + alert: MemoryHighUtilizationForLimit labels: - severity: warning + severity: critical annotations: summary: >- - Kubernetes HPA metrics unavailability (instance {{ `{{ $labels.instance }}` }}) - description: >- - HPA {{`{{ $labels.namespace }}`}}/{{`{{ $labels.horizontalpodautoscaler }}`}} - is unable to collect metrics + Container High Memory usage (instance {{ `{{ $labels.instance }}` }}) + description: |- + Container Memory usage is above 70% of memory limit VALUE = {{`{{ $value }}`}} - for: 15m expr: >- sum by (namespace, pod) - (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"}) > 0 + (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown|Failed"}) > 0 alert: KubernetesPodNotHealthy labels: severity: critical @@ -514,7 +511,7 @@ spec: non-running state for longer than 15 minutes. 
VALUE = {{`{{ $value }}`}} - for: 2m - expr: increase(kube_pod_container_status_restarts_total[1m]) > 3 + expr: increase(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[1m]) > 3 alert: KubernetesPodCrashLooping labels: severity: warning @@ -526,34 +523,86 @@ spec: VALUE = {{`{{ $value }}`}} - for: 2m expr: >- - (kube_horizontalpodautoscaler_status_desired_replicas >= - kube_horizontalpodautoscaler_spec_max_replicas) and - (kube_horizontalpodautoscaler_spec_max_replicas > 1) and - (kube_horizontalpodautoscaler_spec_min_replicas != - kube_horizontalpodautoscaler_spec_max_replicas) - alert: KubernetesHpaScaleMaximum + sum(increase(container_oom_events_total{job="kubelet", container!=""}[5m])) + by (container, pod, namespace) > 0 + alert: ContainerOOMKilled labels: - severity: info + severity: critical annotations: - summary: Kubernetes HPA scale maximum (instance {{ `{{ $labels.instance }}` }}) - description: >- - HPA {{`{{ $labels.namespace }}`}}/{{`{{ $labels.horizontalpodautoscaler }}`}} - has hit maximum number of desired pods - VALUE = {{`{{ $value }}`}} - - for: 0m + summary: >- + Container OOM Killed ({{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}}/{{`{{ $labels.container }}`}}) + description: |- + Container {{`{{ $labels.container }}`}} in pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} + has been OOM killed in the last 5 minutes + VALUE = {{`{{ $value }}`}} times + - for: 10m expr: >- - max(quantile_over_time(0.5, - kube_horizontalpodautoscaler_status_desired_replicas[1d]) == - kube_horizontalpodautoscaler_spec_min_replicas) by - (horizontalpodautoscaler) > 3 - alert: KubernetesHpaUnderutilized + (sum(kubelet_volume_stats_used_bytes{job="kubelet"}) by (persistentvolumeclaim, namespace) / + sum(kubelet_volume_stats_capacity_bytes{job="kubelet"}) by (persistentvolumeclaim, namespace)) * 100 > 90 + alert: PersistentVolumeUsageHigh labels: - severity: info + severity: critical annotations: - summary: Kubernetes HPA underutilized 
(instance {{ `{{ $labels.instance }}` }}) - description: >- - HPA {{`{{ $labels.namespace }}`}}/{{`{{ $labels.horizontalpodautoscaler }}`}} - is constantly at minimum replicas for 50% of the time. - Potential cost saving here. + summary: >- + Persistent Volume usage is above 90% ({{`{{ $labels.namespace }}`}}/{{`{{ $labels.persistentvolumeclaim }}`}}) + description: |- + Persistent Volume {{`{{ $labels.persistentvolumeclaim }}`}} in namespace {{`{{ $labels.namespace }}`}} + is using more than 90% of its capacity. + VALUE = {{`{{ $value }}`}}% + - for: 5m + expr: (1 - (node_filesystem_avail_bytes{job="node-exporter"} / node_filesystem_size_bytes{job="node-exporter"})) > 0.9 + alert: NodeDiskPressure + labels: + severity: critical + annotations: + summary: >- + Node disk usage is above 90% ({{`{{ $labels.instance }}`}}) + description: |- + Node {{`{{ $labels.instance }}`}} is experiencing disk pressure. + VALUE = {{`{{ $value }}`}}% + - for: 5m + expr: node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} < 0.1 + alert: NodeMemoryPressure + labels: + severity: critical + annotations: + summary: >- + Node memory available is below 10% ({{`{{ $labels.instance }}`}}) + description: |- + Node {{`{{ $labels.instance }}`}} is experiencing memory pressure. + VALUE = {{`{{ $value }}`}}% + - for: 5m + expr: rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[5m]) < 0.1 + alert: NodeCPUPressure + labels: + severity: critical + annotations: + summary: >- + Node CPU usage is above 90% ({{`{{ $labels.instance }}`}}) + description: |- + Node {{`{{ $labels.instance }}`}} is experiencing CPU pressure. 
+ VALUE = {{`{{ $value }}`}}% + - for: 5m + expr: kube_node_status_condition{job="kube-state-metrics", condition="Ready", status="true"} == 0 + alert: NodeNotReady + labels: + severity: critical + annotations: + summary: >- + Node is not ready ({{`{{ $labels.node }}`}}) + description: |- + Node {{`{{ $labels.node }}`}} is not in a ready state. + VALUE = {{`{{ $value }}`}} + - for: 5m + expr: kube_node_status_condition{job="kube-state-metrics", condition="NetworkUnavailable", status="true"} == 1 + alert: NodeNetworkUnavailable + labels: + severity: critical + annotations: + summary: >- + Node network is unavailable ({{`{{ $labels.node }}`}}) + description: |- + Node {{`{{ $labels.node }}`}} network is unavailable. VALUE = {{`{{ $value }}`}} + {{- end }} diff --git a/charts/tfy-k8s-civo-talos-inframold/templates/tfy-agent.yaml b/charts/tfy-k8s-civo-talos-inframold/templates/tfy-agent.yaml index 45c2a8262..61d6f8ea2 100644 --- a/charts/tfy-k8s-civo-talos-inframold/templates/tfy-agent.yaml +++ b/charts/tfy-k8s-civo-talos-inframold/templates/tfy-agent.yaml @@ -14,7 +14,7 @@ spec: server: https://kubernetes.default.svc project: tfy-apps source: - targetRevision: 0.2.35 + targetRevision: 0.2.42 repoURL: https://truefoundry.github.io/infra-charts/ chart: tfy-agent helm: diff --git a/charts/tfy-k8s-civo-talos-inframold/templates/truefoundry.yaml b/charts/tfy-k8s-civo-talos-inframold/templates/truefoundry.yaml index da91b6a75..03f06ccd8 100644 --- a/charts/tfy-k8s-civo-talos-inframold/templates/truefoundry.yaml +++ b/charts/tfy-k8s-civo-talos-inframold/templates/truefoundry.yaml @@ -14,7 +14,7 @@ spec: server: https://kubernetes.default.svc project: tfy-apps source: - targetRevision: 0.11.1 + targetRevision: 0.13.2 repoURL: "https://truefoundry.github.io/infra-charts" chart: truefoundry helm: diff --git a/charts/tfy-k8s-gcp-gke-standard-inframold/Chart.yaml b/charts/tfy-k8s-gcp-gke-standard-inframold/Chart.yaml index 7491001f6..e60d6ea15 100644 --- 
a/charts/tfy-k8s-gcp-gke-standard-inframold/Chart.yaml +++ b/charts/tfy-k8s-gcp-gke-standard-inframold/Chart.yaml @@ -1,6 +1,6 @@ apiVersion: v2 name: tfy-k8s-gcp-gke-standard-inframold -version: 0.1.11 +version: 0.1.12 description: "Inframold, the superchart that configure your cluster on gcp for truefoundry." maintainers: - name: truefoundry diff --git a/charts/tfy-k8s-gcp-gke-standard-inframold/artifacts-manifest.json b/charts/tfy-k8s-gcp-gke-standard-inframold/artifacts-manifest.json index b6db31cde..a5d4c1245 100644 --- a/charts/tfy-k8s-gcp-gke-standard-inframold/artifacts-manifest.json +++ b/charts/tfy-k8s-gcp-gke-standard-inframold/artifacts-manifest.json @@ -163,10 +163,10 @@ "details": { "chart": "tfy-agent", "repoURL": "https://truefoundry.github.io/infra-charts/", - "targetRevision": "0.2.35", + "targetRevision": "0.2.42", "images": [ - "tfy.jfrog.io/tfy-images/tfy-agent:c610a56137336ab818510cb42680add0ae3db54e", - "tfy.jfrog.io/tfy-images/tfy-agent-proxy:0823e317799add6beaaa4037b81068f6c25f3bf7" + "tfy.jfrog.io/tfy-images/tfy-agent:abdd060d96379a09bed4d6c2ab7516a11e154bfa", + "tfy.jfrog.io/tfy-images/tfy-agent-proxy:fcfb8e398eb01f05fb72cd9115b9ec69a89b9cce" ] } }, @@ -198,14 +198,14 @@ "details": { "chart": "truefoundry", "repoURL": "https://truefoundry.github.io/infra-charts", - "targetRevision": "0.11.1", + "targetRevision": "0.13.2", "images": [ - "tfy.jfrog.io/tfy-private-images/mlfoundry-server:v0.8.1", - "tfy.jfrog.io/tfy-private-images/servicefoundry-server:v0.11.0", - "tfy.jfrog.io/tfy-private-images/sfy-manifest-service:v0.8.0", - "tfy.jfrog.io/tfy-private-images/tfy-controller:v0.6.0", - "tfy.jfrog.io/tfy-private-images/tfy-k8s-controller:v0.8.0", - "tfy.jfrog.io/tfy-private-images/truefoundry-frontend-app:v0.10.0", + "tfy.jfrog.io/tfy-private-images/mlfoundry-server:v0.10.0", + "tfy.jfrog.io/tfy-private-images/servicefoundry-server:v0.13.2", + "tfy.jfrog.io/tfy-private-images/sfy-manifest-service:v0.10.0", + 
"tfy.jfrog.io/tfy-private-images/tfy-controller:v0.8.0", + "tfy.jfrog.io/tfy-private-images/tfy-k8s-controller:v0.10.0", + "tfy.jfrog.io/tfy-private-images/truefoundry-frontend-app:v0.12.0", "tfy.jfrog.io/tfy-mirror/nats:2.10.21-alpine3.20", "tfy.jfrog.io/tfy-mirror/natsio/nats-server-config-reloader:0.14.3", "tfy.jfrog.io/tfy-mirror/natsio/prometheus-nats-exporter:0.15.0", @@ -852,33 +852,15 @@ { "type": "image", "details": { - "registryURL": "tfy.jfrog.io/tfy-images/tfy-agent:c610a56137336ab818510cb42680add0ae3db54e", - "platforms": [ - { - "os": "linux", - "architecture": "amd64" - }, - { - "os": "linux", - "architecture": "arm64" - } - ] + "registryURL": "tfy.jfrog.io/tfy-images/tfy-agent:abdd060d96379a09bed4d6c2ab7516a11e154bfa", + "platforms": [] } }, { "type": "image", "details": { - "registryURL": "tfy.jfrog.io/tfy-images/tfy-agent-proxy:0823e317799add6beaaa4037b81068f6c25f3bf7", - "platforms": [ - { - "os": "linux", - "architecture": "amd64" - }, - { - "os": "linux", - "architecture": "arm64" - } - ] + "registryURL": "tfy.jfrog.io/tfy-images/tfy-agent-proxy:fcfb8e398eb01f05fb72cd9115b9ec69a89b9cce", + "platforms": [] } }, { @@ -930,88 +912,43 @@ { "type": "image", "details": { - "registryURL": "tfy.jfrog.io/tfy-private-images/mlfoundry-server:v0.8.1", - "platforms": [ - { - "os": "linux", - "architecture": "amd64" - }, - { - "os": "linux", - "architecture": "arm64" - } - ] + "registryURL": "tfy.jfrog.io/tfy-private-images/mlfoundry-server:v0.10.0", + "platforms": [] } }, { "type": "image", "details": { - "registryURL": "tfy.jfrog.io/tfy-private-images/servicefoundry-server:v0.11.0", + "registryURL": "tfy.jfrog.io/tfy-private-images/servicefoundry-server:v0.13.2", "platforms": [] } }, { "type": "image", "details": { - "registryURL": "tfy.jfrog.io/tfy-private-images/sfy-manifest-service:v0.8.0", - "platforms": [ - { - "os": "linux", - "architecture": "amd64" - }, - { - "os": "linux", - "architecture": "arm64" - } - ] + "registryURL": 
"tfy.jfrog.io/tfy-private-images/sfy-manifest-service:v0.10.0", + "platforms": [] } }, { "type": "image", "details": { - "registryURL": "tfy.jfrog.io/tfy-private-images/tfy-controller:v0.6.0", - "platforms": [ - { - "os": "linux", - "architecture": "amd64" - }, - { - "os": "linux", - "architecture": "arm64" - } - ] + "registryURL": "tfy.jfrog.io/tfy-private-images/tfy-controller:v0.8.0", + "platforms": [] } }, { "type": "image", "details": { - "registryURL": "tfy.jfrog.io/tfy-private-images/tfy-k8s-controller:v0.8.0", - "platforms": [ - { - "os": "linux", - "architecture": "amd64" - }, - { - "os": "linux", - "architecture": "arm64" - } - ] + "registryURL": "tfy.jfrog.io/tfy-private-images/tfy-k8s-controller:v0.10.0", + "platforms": [] } }, { "type": "image", "details": { - "registryURL": "tfy.jfrog.io/tfy-private-images/truefoundry-frontend-app:v0.10.0", - "platforms": [ - { - "os": "linux", - "architecture": "amd64" - }, - { - "os": "linux", - "architecture": "arm64" - } - ] + "registryURL": "tfy.jfrog.io/tfy-private-images/truefoundry-frontend-app:v0.12.0", + "platforms": [] } }, { diff --git a/charts/tfy-k8s-gcp-gke-standard-inframold/templates/prometheus.yaml b/charts/tfy-k8s-gcp-gke-standard-inframold/templates/prometheus.yaml index b287462a7..ab6f20f66 100644 --- a/charts/tfy-k8s-gcp-gke-standard-inframold/templates/prometheus.yaml +++ b/charts/tfy-k8s-gcp-gke-standard-inframold/templates/prometheus.yaml @@ -25,6 +25,8 @@ spec: chart: kube-prometheus-stack helm: values: | + defaultRules: + enabled: false coreDns: enabled: false grafana: @@ -423,42 +425,41 @@ spec: labels: daemonset: 'true' record: kubecost_savings_memory_allocation_bytes - - name: GoogleCadvisor + - name: Alerting rules: - for: 2m expr: >- - (sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) + sum(rate(container_cpu_usage_seconds_total{job="kubelet", container!=""}[5m])) by (pod, container, namespace) / - 
sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) - by (pod, container, namespace) * 100) > 80 - alert: ContainerHighCpuUtilization + sum(kube_pod_container_resource_requests{job="kube-state-metrics", resource="cpu"}) + by (pod, container, namespace) * 100 > 80 + alert: CPUHighUtilizationForRequested labels: severity: warning annotations: summary: >- Container High CPU utilization (instance {{ `{{ $labels.instance }}` }}) description: |- - Container CPU utilization is above 80% + Container CPU utilization is above 80% of requested CPU VALUE = {{`{{ $value }}`}} - for: 2m expr: >- - (sum(container_memory_working_set_bytes{container!=""}) BY (pod, - container, namespace) / sum(container_spec_memory_limit_bytes > 0) BY (pod, - container, namespace) * 100) > 80 - alert: ContainerHighMemoryUsage + sum(container_memory_working_set_bytes{job="kubelet", container!=""}) by (pod, container, namespace) / + sum(kube_pod_container_resource_requests{job="kube-state-metrics", resource="memory"}) by (pod, container, namespace) * 100 > 80 + alert: MemoryHighUtilizationForRequested labels: severity: warning annotations: summary: Container High Memory usage (instance {{ `{{ $labels.instance }}` }}) description: |- - Container Memory usage is above 80% + Container Memory usage is above 80% of requested memory VALUE = {{`{{ $value }}`}} - for: 5m expr: >- - sum(increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) - by (container, pod, namespace) / - sum(increase(container_cpu_cfs_periods_total[5m])) by - (container, pod, namespace) > ( 25 / 100 ) + sum(increase(container_cpu_cfs_throttled_periods_total{job="kubelet", container!=""}[5m])) + by (pod, container, namespace) / + sum(increase(container_cpu_cfs_periods_total{job="kubelet"}[5m])) by + (pod, container, namespace) > ( 25 / 100 ) alert: ContainerHighThrottleRate labels: severity: warning @@ -467,42 +468,38 @@ spec: description: |- Container is being throttled VALUE = {{`{{ 
$value }}`}} - - name: KubestateExporter - rules: - for: 2m expr: >- - (kube_horizontalpodautoscaler_spec_max_replicas - - kube_horizontalpodautoscaler_status_desired_replicas) * on - (horizontalpodautoscaler,namespace, pod) - (kube_horizontalpodautoscaler_status_condition{condition="ScalingLimited", - status="true"} == 1) == 0 - alert: KubernetesHpaScaleInability + sum(rate(container_cpu_usage_seconds_total{job="kubelet", container!=""}[5m])) + by (pod, container, namespace) / + sum(kube_pod_container_resource_limits{job="kube-state-metrics", resource="cpu"}) + by (pod, container, namespace) * 100 > 70 + alert: CPUHighUtilizationForLimit labels: - severity: warning + severity: critical annotations: summary: >- - Kubernetes HPA scale inability (instance {{ `{{ $labels.instance }}` }}) - description: >- - HPA {{`{{ $labels.namespace }}`}}/{{`{{ $labels.horizontalpodautoscaler }}`}} is unable to scale + Container High CPU utilization (instance {{ `{{ $labels.instance }}` }}) + description: |- + Container CPU utilization is above 70% of CPU limit VALUE = {{`{{ $value }}`}} - - for: 0m + - for: 2m expr: >- - kube_horizontalpodautoscaler_status_condition{status="false", - condition="ScalingActive"} == 1 - alert: KubernetesHpaMetricsUnavailability + sum(container_memory_working_set_bytes{job="kubelet", container!=""}) by (pod, container, namespace) / + sum(kube_pod_container_resource_limits{job="kube-state-metrics", resource="memory"}) by (pod, container, namespace) * 100 > 70 + alert: MemoryHighUtilizationForLimit labels: - severity: warning + severity: critical annotations: summary: >- - Kubernetes HPA metrics unavailability (instance {{ `{{ $labels.instance }}` }}) - description: >- - HPA {{`{{ $labels.namespace }}`}}/{{`{{ $labels.horizontalpodautoscaler }}`}} - is unable to collect metrics + Container High Memory usage (instance {{ `{{ $labels.instance }}` }}) + description: |- + Container Memory usage is above 70% of memory limit VALUE = {{`{{ $value }}`}} - for: 
15m expr: >- sum by (namespace, pod) - (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"}) > 0 + (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown|Failed"}) > 0 alert: KubernetesPodNotHealthy labels: severity: critical @@ -514,7 +511,7 @@ spec: non-running state for longer than 15 minutes. VALUE = {{`{{ $value }}`}} - for: 2m - expr: increase(kube_pod_container_status_restarts_total[1m]) > 3 + expr: increase(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[1m]) > 3 alert: KubernetesPodCrashLooping labels: severity: warning @@ -526,34 +523,86 @@ spec: VALUE = {{`{{ $value }}`}} - for: 2m expr: >- - (kube_horizontalpodautoscaler_status_desired_replicas >= - kube_horizontalpodautoscaler_spec_max_replicas) and - (kube_horizontalpodautoscaler_spec_max_replicas > 1) and - (kube_horizontalpodautoscaler_spec_min_replicas != - kube_horizontalpodautoscaler_spec_max_replicas) - alert: KubernetesHpaScaleMaximum + sum(increase(container_oom_events_total{job="kubelet", container!=""}[5m])) + by (container, pod, namespace) > 0 + alert: ContainerOOMKilled labels: - severity: info + severity: critical annotations: - summary: Kubernetes HPA scale maximum (instance {{ `{{ $labels.instance }}` }}) - description: >- - HPA {{`{{ $labels.namespace }}`}}/{{`{{ $labels.horizontalpodautoscaler }}`}} - has hit maximum number of desired pods - VALUE = {{`{{ $value }}`}} - - for: 0m + summary: >- + Container OOM Killed ({{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}}/{{`{{ $labels.container }}`}}) + description: |- + Container {{`{{ $labels.container }}`}} in pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} + has been OOM killed in the last 5 minutes + VALUE = {{`{{ $value }}`}} times + - for: 10m expr: >- - max(quantile_over_time(0.5, - kube_horizontalpodautoscaler_status_desired_replicas[1d]) == - kube_horizontalpodautoscaler_spec_min_replicas) by - (horizontalpodautoscaler) > 3 - alert: KubernetesHpaUnderutilized + 
(sum(kubelet_volume_stats_used_bytes{job="kubelet"}) by (persistentvolumeclaim, namespace) / + sum(kubelet_volume_stats_capacity_bytes{job="kubelet"}) by (persistentvolumeclaim, namespace)) * 100 > 90 + alert: PersistentVolumeUsageHigh labels: - severity: info + severity: critical annotations: - summary: Kubernetes HPA underutilized (instance {{ `{{ $labels.instance }}` }}) - description: >- - HPA {{`{{ $labels.namespace }}`}}/{{`{{ $labels.horizontalpodautoscaler }}`}} - is constantly at minimum replicas for 50% of the time. - Potential cost saving here. + summary: >- + Persistent Volume usage is above 90% ({{`{{ $labels.namespace }}`}}/{{`{{ $labels.persistentvolumeclaim }}`}}) + description: |- + Persistent Volume {{`{{ $labels.persistentvolumeclaim }}`}} in namespace {{`{{ $labels.namespace }}`}} + is using more than 90% of its capacity. + VALUE = {{`{{ $value }}`}}% + - for: 5m + expr: (1 - (node_filesystem_avail_bytes{job="node-exporter"} / node_filesystem_size_bytes{job="node-exporter"})) > 0.9 + alert: NodeDiskPressure + labels: + severity: critical + annotations: + summary: >- + Node disk usage is above 90% ({{`{{ $labels.instance }}`}}) + description: |- + Node {{`{{ $labels.instance }}`}} is experiencing disk pressure. + VALUE = {{`{{ $value }}`}}% + - for: 5m + expr: node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} < 0.1 + alert: NodeMemoryPressure + labels: + severity: critical + annotations: + summary: >- + Node memory available is below 10% ({{`{{ $labels.instance }}`}}) + description: |- + Node {{`{{ $labels.instance }}`}} is experiencing memory pressure. 
+ VALUE = {{`{{ $value }}`}}% + - for: 5m + expr: rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[5m]) < 0.1 + alert: NodeCPUPressure + labels: + severity: critical + annotations: + summary: >- + Node CPU usage is above 90% ({{`{{ $labels.instance }}`}}) + description: |- + Node {{`{{ $labels.instance }}`}} is experiencing CPU pressure. + VALUE = {{`{{ $value }}`}}% + - for: 5m + expr: kube_node_status_condition{job="kube-state-metrics", condition="Ready", status="true"} == 0 + alert: NodeNotReady + labels: + severity: critical + annotations: + summary: >- + Node is not ready ({{`{{ $labels.node }}`}}) + description: |- + Node {{`{{ $labels.node }}`}} is not in a ready state. + VALUE = {{`{{ $value }}`}} + - for: 5m + expr: kube_node_status_condition{job="kube-state-metrics", condition="NetworkUnavailable", status="true"} == 1 + alert: NodeNetworkUnavailable + labels: + severity: critical + annotations: + summary: >- + Node network is unavailable ({{`{{ $labels.node }}`}}) + description: |- + Node {{`{{ $labels.node }}`}} network is unavailable. 
VALUE = {{`{{ $value }}`}} + {{- end }} diff --git a/charts/tfy-k8s-gcp-gke-standard-inframold/templates/tfy-agent.yaml b/charts/tfy-k8s-gcp-gke-standard-inframold/templates/tfy-agent.yaml index 45c2a8262..61d6f8ea2 100644 --- a/charts/tfy-k8s-gcp-gke-standard-inframold/templates/tfy-agent.yaml +++ b/charts/tfy-k8s-gcp-gke-standard-inframold/templates/tfy-agent.yaml @@ -14,7 +14,7 @@ spec: server: https://kubernetes.default.svc project: tfy-apps source: - targetRevision: 0.2.35 + targetRevision: 0.2.42 repoURL: https://truefoundry.github.io/infra-charts/ chart: tfy-agent helm: diff --git a/charts/tfy-k8s-gcp-gke-standard-inframold/templates/truefoundry.yaml b/charts/tfy-k8s-gcp-gke-standard-inframold/templates/truefoundry.yaml index da91b6a75..03f06ccd8 100644 --- a/charts/tfy-k8s-gcp-gke-standard-inframold/templates/truefoundry.yaml +++ b/charts/tfy-k8s-gcp-gke-standard-inframold/templates/truefoundry.yaml @@ -14,7 +14,7 @@ spec: server: https://kubernetes.default.svc project: tfy-apps source: - targetRevision: 0.11.1 + targetRevision: 0.13.2 repoURL: "https://truefoundry.github.io/infra-charts" chart: truefoundry helm: diff --git a/charts/tfy-k8s-generic-inframold/Chart.yaml b/charts/tfy-k8s-generic-inframold/Chart.yaml index 041b6cd6f..25cee1ac6 100644 --- a/charts/tfy-k8s-generic-inframold/Chart.yaml +++ b/charts/tfy-k8s-generic-inframold/Chart.yaml @@ -1,6 +1,6 @@ apiVersion: v2 name: tfy-k8s-generic-inframold -version: 0.1.11 +version: 0.1.12 description: "Inframold, the superchart that configure your cluster on generic for truefoundry." 
maintainers: - name: truefoundry diff --git a/charts/tfy-k8s-generic-inframold/artifacts-manifest.json b/charts/tfy-k8s-generic-inframold/artifacts-manifest.json index fb0358a9f..f506cc6df 100644 --- a/charts/tfy-k8s-generic-inframold/artifacts-manifest.json +++ b/charts/tfy-k8s-generic-inframold/artifacts-manifest.json @@ -150,10 +150,10 @@ "details": { "chart": "tfy-agent", "repoURL": "https://truefoundry.github.io/infra-charts/", - "targetRevision": "0.2.35", + "targetRevision": "0.2.42", "images": [ - "tfy.jfrog.io/tfy-images/tfy-agent:c610a56137336ab818510cb42680add0ae3db54e", - "tfy.jfrog.io/tfy-images/tfy-agent-proxy:0823e317799add6beaaa4037b81068f6c25f3bf7" + "tfy.jfrog.io/tfy-images/tfy-agent:abdd060d96379a09bed4d6c2ab7516a11e154bfa", + "tfy.jfrog.io/tfy-images/tfy-agent-proxy:fcfb8e398eb01f05fb72cd9115b9ec69a89b9cce" ] } }, @@ -183,14 +183,14 @@ "details": { "chart": "truefoundry", "repoURL": "https://truefoundry.github.io/infra-charts", - "targetRevision": "0.11.1", + "targetRevision": "0.13.2", "images": [ - "tfy.jfrog.io/tfy-private-images/mlfoundry-server:v0.8.1", - "tfy.jfrog.io/tfy-private-images/servicefoundry-server:v0.11.0", - "tfy.jfrog.io/tfy-private-images/sfy-manifest-service:v0.8.0", - "tfy.jfrog.io/tfy-private-images/tfy-controller:v0.6.0", - "tfy.jfrog.io/tfy-private-images/tfy-k8s-controller:v0.8.0", - "tfy.jfrog.io/tfy-private-images/truefoundry-frontend-app:v0.10.0", + "tfy.jfrog.io/tfy-private-images/mlfoundry-server:v0.10.0", + "tfy.jfrog.io/tfy-private-images/servicefoundry-server:v0.13.2", + "tfy.jfrog.io/tfy-private-images/sfy-manifest-service:v0.10.0", + "tfy.jfrog.io/tfy-private-images/tfy-controller:v0.8.0", + "tfy.jfrog.io/tfy-private-images/tfy-k8s-controller:v0.10.0", + "tfy.jfrog.io/tfy-private-images/truefoundry-frontend-app:v0.12.0", "tfy.jfrog.io/tfy-mirror/nats:2.10.21-alpine3.20", "tfy.jfrog.io/tfy-mirror/natsio/nats-server-config-reloader:0.14.3", "tfy.jfrog.io/tfy-mirror/natsio/prometheus-nats-exporter:0.15.0", @@ 
-789,33 +789,15 @@ { "type": "image", "details": { - "registryURL": "tfy.jfrog.io/tfy-images/tfy-agent:c610a56137336ab818510cb42680add0ae3db54e", - "platforms": [ - { - "os": "linux", - "architecture": "amd64" - }, - { - "os": "linux", - "architecture": "arm64" - } - ] + "registryURL": "tfy.jfrog.io/tfy-images/tfy-agent:abdd060d96379a09bed4d6c2ab7516a11e154bfa", + "platforms": [] } }, { "type": "image", "details": { - "registryURL": "tfy.jfrog.io/tfy-images/tfy-agent-proxy:0823e317799add6beaaa4037b81068f6c25f3bf7", - "platforms": [ - { - "os": "linux", - "architecture": "amd64" - }, - { - "os": "linux", - "architecture": "arm64" - } - ] + "registryURL": "tfy.jfrog.io/tfy-images/tfy-agent-proxy:fcfb8e398eb01f05fb72cd9115b9ec69a89b9cce", + "platforms": [] } }, { @@ -853,88 +835,43 @@ { "type": "image", "details": { - "registryURL": "tfy.jfrog.io/tfy-private-images/mlfoundry-server:v0.8.1", - "platforms": [ - { - "os": "linux", - "architecture": "amd64" - }, - { - "os": "linux", - "architecture": "arm64" - } - ] + "registryURL": "tfy.jfrog.io/tfy-private-images/mlfoundry-server:v0.10.0", + "platforms": [] } }, { "type": "image", "details": { - "registryURL": "tfy.jfrog.io/tfy-private-images/servicefoundry-server:v0.11.0", + "registryURL": "tfy.jfrog.io/tfy-private-images/servicefoundry-server:v0.13.2", "platforms": [] } }, { "type": "image", "details": { - "registryURL": "tfy.jfrog.io/tfy-private-images/sfy-manifest-service:v0.8.0", - "platforms": [ - { - "os": "linux", - "architecture": "amd64" - }, - { - "os": "linux", - "architecture": "arm64" - } - ] + "registryURL": "tfy.jfrog.io/tfy-private-images/sfy-manifest-service:v0.10.0", + "platforms": [] } }, { "type": "image", "details": { - "registryURL": "tfy.jfrog.io/tfy-private-images/tfy-controller:v0.6.0", - "platforms": [ - { - "os": "linux", - "architecture": "amd64" - }, - { - "os": "linux", - "architecture": "arm64" - } - ] + "registryURL": "tfy.jfrog.io/tfy-private-images/tfy-controller:v0.8.0", + 
"platforms": [] } }, { "type": "image", "details": { - "registryURL": "tfy.jfrog.io/tfy-private-images/tfy-k8s-controller:v0.8.0", - "platforms": [ - { - "os": "linux", - "architecture": "amd64" - }, - { - "os": "linux", - "architecture": "arm64" - } - ] + "registryURL": "tfy.jfrog.io/tfy-private-images/tfy-k8s-controller:v0.10.0", + "platforms": [] } }, { "type": "image", "details": { - "registryURL": "tfy.jfrog.io/tfy-private-images/truefoundry-frontend-app:v0.10.0", - "platforms": [ - { - "os": "linux", - "architecture": "amd64" - }, - { - "os": "linux", - "architecture": "arm64" - } - ] + "registryURL": "tfy.jfrog.io/tfy-private-images/truefoundry-frontend-app:v0.12.0", + "platforms": [] } }, { diff --git a/charts/tfy-k8s-generic-inframold/templates/prometheus.yaml b/charts/tfy-k8s-generic-inframold/templates/prometheus.yaml index b287462a7..ab6f20f66 100644 --- a/charts/tfy-k8s-generic-inframold/templates/prometheus.yaml +++ b/charts/tfy-k8s-generic-inframold/templates/prometheus.yaml @@ -25,6 +25,8 @@ spec: chart: kube-prometheus-stack helm: values: | + defaultRules: + enabled: false coreDns: enabled: false grafana: @@ -423,42 +425,41 @@ spec: labels: daemonset: 'true' record: kubecost_savings_memory_allocation_bytes - - name: GoogleCadvisor + - name: Alerting rules: - for: 2m expr: >- - (sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) + sum(rate(container_cpu_usage_seconds_total{job="kubelet", container!=""}[5m])) by (pod, container, namespace) / - sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) - by (pod, container, namespace) * 100) > 80 - alert: ContainerHighCpuUtilization + sum(kube_pod_container_resource_requests{job="kube-state-metrics", resource="cpu"}) + by (pod, container, namespace) * 100 > 80 + alert: CPUHighUtilizationForRequested labels: severity: warning annotations: summary: >- Container High CPU utilization (instance {{ `{{ $labels.instance }}` }}) description: |- - Container CPU 
utilization is above 80% + Container CPU utilization is above 80% of requested CPU VALUE = {{`{{ $value }}`}} - for: 2m expr: >- - (sum(container_memory_working_set_bytes{container!=""}) BY (pod, - container, namespace) / sum(container_spec_memory_limit_bytes > 0) BY (pod, - container, namespace) * 100) > 80 - alert: ContainerHighMemoryUsage + sum(container_memory_working_set_bytes{job="kubelet", container!=""}) by (pod, container, namespace) / + sum(kube_pod_container_resource_requests{job="kube-state-metrics", resource="memory"}) by (pod, container, namespace) * 100 > 80 + alert: MemoryHighUtilizationForRequested labels: severity: warning annotations: summary: Container High Memory usage (instance {{ `{{ $labels.instance }}` }}) description: |- - Container Memory usage is above 80% + Container Memory usage is above 80% of requested memory VALUE = {{`{{ $value }}`}} - for: 5m expr: >- - sum(increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) - by (container, pod, namespace) / - sum(increase(container_cpu_cfs_periods_total[5m])) by - (container, pod, namespace) > ( 25 / 100 ) + sum(increase(container_cpu_cfs_throttled_periods_total{job="kubelet", container!=""}[5m])) + by (pod, container, namespace) / + sum(increase(container_cpu_cfs_periods_total{job="kubelet"}[5m])) by + (pod, container, namespace) > ( 25 / 100 ) alert: ContainerHighThrottleRate labels: severity: warning @@ -467,42 +468,38 @@ spec: description: |- Container is being throttled VALUE = {{`{{ $value }}`}} - - name: KubestateExporter - rules: - for: 2m expr: >- - (kube_horizontalpodautoscaler_spec_max_replicas - - kube_horizontalpodautoscaler_status_desired_replicas) * on - (horizontalpodautoscaler,namespace, pod) - (kube_horizontalpodautoscaler_status_condition{condition="ScalingLimited", - status="true"} == 1) == 0 - alert: KubernetesHpaScaleInability + sum(rate(container_cpu_usage_seconds_total{job="kubelet", container!=""}[5m])) + by (pod, container, namespace) / + 
sum(kube_pod_container_resource_limits{job="kube-state-metrics", resource="cpu"}) + by (pod, container, namespace) * 100 > 70 + alert: CPUHighUtilizationForLimit labels: - severity: warning + severity: critical annotations: summary: >- - Kubernetes HPA scale inability (instance {{ `{{ $labels.instance }}` }}) - description: >- - HPA {{`{{ $labels.namespace }}`}}/{{`{{ $labels.horizontalpodautoscaler }}`}} is unable to scale + Container High CPU utilization (instance {{ `{{ $labels.instance }}` }}) + description: |- + Container CPU utilization is above 70% of CPU limit VALUE = {{`{{ $value }}`}} - - for: 0m + - for: 2m expr: >- - kube_horizontalpodautoscaler_status_condition{status="false", - condition="ScalingActive"} == 1 - alert: KubernetesHpaMetricsUnavailability + sum(container_memory_working_set_bytes{job="kubelet", container!=""}) by (pod, container, namespace) / + sum(kube_pod_container_resource_limits{job="kube-state-metrics", resource="memory"}) by (pod, container, namespace) * 100 > 70 + alert: MemoryHighUtilizationForLimit labels: - severity: warning + severity: critical annotations: summary: >- - Kubernetes HPA metrics unavailability (instance {{ `{{ $labels.instance }}` }}) - description: >- - HPA {{`{{ $labels.namespace }}`}}/{{`{{ $labels.horizontalpodautoscaler }}`}} - is unable to collect metrics + Container High Memory usage (instance {{ `{{ $labels.instance }}` }}) + description: |- + Container Memory usage is above 70% of memory limit VALUE = {{`{{ $value }}`}} - for: 15m expr: >- sum by (namespace, pod) - (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"}) > 0 + (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown|Failed"}) > 0 alert: KubernetesPodNotHealthy labels: severity: critical @@ -514,7 +511,7 @@ spec: non-running state for longer than 15 minutes. 
VALUE = {{`{{ $value }}`}} - for: 2m - expr: increase(kube_pod_container_status_restarts_total[1m]) > 3 + expr: increase(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[1m]) > 3 alert: KubernetesPodCrashLooping labels: severity: warning @@ -526,34 +523,86 @@ spec: VALUE = {{`{{ $value }}`}} - for: 2m expr: >- - (kube_horizontalpodautoscaler_status_desired_replicas >= - kube_horizontalpodautoscaler_spec_max_replicas) and - (kube_horizontalpodautoscaler_spec_max_replicas > 1) and - (kube_horizontalpodautoscaler_spec_min_replicas != - kube_horizontalpodautoscaler_spec_max_replicas) - alert: KubernetesHpaScaleMaximum + sum(increase(container_oom_events_total{job="kubelet", container!=""}[5m])) + by (container, pod, namespace) > 0 + alert: ContainerOOMKilled labels: - severity: info + severity: critical annotations: - summary: Kubernetes HPA scale maximum (instance {{ `{{ $labels.instance }}` }}) - description: >- - HPA {{`{{ $labels.namespace }}`}}/{{`{{ $labels.horizontalpodautoscaler }}`}} - has hit maximum number of desired pods - VALUE = {{`{{ $value }}`}} - - for: 0m + summary: >- + Container OOM Killed ({{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}}/{{`{{ $labels.container }}`}}) + description: |- + Container {{`{{ $labels.container }}`}} in pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} + has been OOM killed in the last 5 minutes + VALUE = {{`{{ $value }}`}} times + - for: 10m expr: >- - max(quantile_over_time(0.5, - kube_horizontalpodautoscaler_status_desired_replicas[1d]) == - kube_horizontalpodautoscaler_spec_min_replicas) by - (horizontalpodautoscaler) > 3 - alert: KubernetesHpaUnderutilized + (sum(kubelet_volume_stats_used_bytes{job="kubelet"}) by (persistentvolumeclaim, namespace) / + sum(kubelet_volume_stats_capacity_bytes{job="kubelet"}) by (persistentvolumeclaim, namespace)) * 100 > 90 + alert: PersistentVolumeUsageHigh labels: - severity: info + severity: critical annotations: - summary: Kubernetes HPA underutilized 
(instance {{ `{{ $labels.instance }}` }}) - description: >- - HPA {{`{{ $labels.namespace }}`}}/{{`{{ $labels.horizontalpodautoscaler }}`}} - is constantly at minimum replicas for 50% of the time. - Potential cost saving here. + summary: >- + Persistent Volume usage is above 90% ({{`{{ $labels.namespace }}`}}/{{`{{ $labels.persistentvolumeclaim }}`}}) + description: |- + Persistent Volume {{`{{ $labels.persistentvolumeclaim }}`}} in namespace {{`{{ $labels.namespace }}`}} + is using more than 90% of its capacity. + VALUE = {{`{{ $value }}`}}% + - for: 5m + expr: (1 - (node_filesystem_avail_bytes{job="node-exporter"} / node_filesystem_size_bytes{job="node-exporter"})) > 0.9 + alert: NodeDiskPressure + labels: + severity: critical + annotations: + summary: >- + Node disk usage is above 90% ({{`{{ $labels.instance }}`}}) + description: |- + Node {{`{{ $labels.instance }}`}} is experiencing disk pressure. + VALUE = {{`{{ $value }}`}}% + - for: 5m + expr: node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} < 0.1 + alert: NodeMemoryPressure + labels: + severity: critical + annotations: + summary: >- + Node memory available is below 10% ({{`{{ $labels.instance }}`}}) + description: |- + Node {{`{{ $labels.instance }}`}} is experiencing memory pressure. + VALUE = {{`{{ $value }}`}}% + - for: 5m + expr: rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[5m]) < 0.1 + alert: NodeCPUPressure + labels: + severity: critical + annotations: + summary: >- + Node CPU usage is above 90% ({{`{{ $labels.instance }}`}}) + description: |- + Node {{`{{ $labels.instance }}`}} is experiencing CPU pressure. 
+ VALUE = {{`{{ $value }}`}}% + - for: 5m + expr: kube_node_status_condition{job="kube-state-metrics", condition="Ready", status="true"} == 0 + alert: NodeNotReady + labels: + severity: critical + annotations: + summary: >- + Node is not ready ({{`{{ $labels.node }}`}}) + description: |- + Node {{`{{ $labels.node }}`}} is not in a ready state. + VALUE = {{`{{ $value }}`}} + - for: 5m + expr: kube_node_status_condition{job="kube-state-metrics", condition="NetworkUnavailable", status="true"} == 1 + alert: NodeNetworkUnavailable + labels: + severity: critical + annotations: + summary: >- + Node network is unavailable ({{`{{ $labels.node }}`}}) + description: |- + Node {{`{{ $labels.node }}`}} network is unavailable. VALUE = {{`{{ $value }}`}} + {{- end }} diff --git a/charts/tfy-k8s-generic-inframold/templates/tfy-agent.yaml b/charts/tfy-k8s-generic-inframold/templates/tfy-agent.yaml index 45c2a8262..61d6f8ea2 100644 --- a/charts/tfy-k8s-generic-inframold/templates/tfy-agent.yaml +++ b/charts/tfy-k8s-generic-inframold/templates/tfy-agent.yaml @@ -14,7 +14,7 @@ spec: server: https://kubernetes.default.svc project: tfy-apps source: - targetRevision: 0.2.35 + targetRevision: 0.2.42 repoURL: https://truefoundry.github.io/infra-charts/ chart: tfy-agent helm: diff --git a/charts/tfy-k8s-generic-inframold/templates/truefoundry.yaml b/charts/tfy-k8s-generic-inframold/templates/truefoundry.yaml index da91b6a75..03f06ccd8 100644 --- a/charts/tfy-k8s-generic-inframold/templates/truefoundry.yaml +++ b/charts/tfy-k8s-generic-inframold/templates/truefoundry.yaml @@ -14,7 +14,7 @@ spec: server: https://kubernetes.default.svc project: tfy-apps source: - targetRevision: 0.11.1 + targetRevision: 0.13.2 repoURL: "https://truefoundry.github.io/infra-charts" chart: truefoundry helm: diff --git a/charts/tfy-llm-gateway-infra/Chart.yaml b/charts/tfy-llm-gateway-infra/Chart.yaml index 36a5dd579..29e896d16 100644 --- a/charts/tfy-llm-gateway-infra/Chart.yaml +++ 
b/charts/tfy-llm-gateway-infra/Chart.yaml @@ -1,6 +1,6 @@ apiVersion: v2 name: tfy-llm-gateway-infra -version: 0.1.0 +version: 0.1.1 description: "TrueFoundry LLM Gateway infra stack chart" maintainers: - name: truefoundry diff --git a/charts/tfy-llm-gateway-infra/values.yaml b/charts/tfy-llm-gateway-infra/values.yaml index f253e5329..ffdcb2798 100644 --- a/charts/tfy-llm-gateway-infra/values.yaml +++ b/charts/tfy-llm-gateway-infra/values.yaml @@ -207,7 +207,7 @@ nats: ## @skip clickhouse [object] clickhouse config clickhouse: - enabled: false + enabled: true clusterName: clickhouse user: user shardsCount: 1 diff --git a/charts/tfy-llm-gateway/Chart.yaml b/charts/tfy-llm-gateway/Chart.yaml index 49dca5722..9bdc67cb3 100644 --- a/charts/tfy-llm-gateway/Chart.yaml +++ b/charts/tfy-llm-gateway/Chart.yaml @@ -1,6 +1,6 @@ apiVersion: v2 name: tfy-llm-gateway -version: 0.9.0 +version: 0.10.1 description: "Truefoundry LLM Gateway deployment chart" maintainers: - name: truefoundry diff --git a/charts/tfy-llm-gateway/README.md b/charts/tfy-llm-gateway/README.md index bd6513cb5..7de2fcd76 100644 --- a/charts/tfy-llm-gateway/README.md +++ b/charts/tfy-llm-gateway/README.md @@ -5,61 +5,65 @@ LLM-Gateway Helm Chart ### Configuration for LLM Gateway -| Name | Description | Value | -| -------------------------------------------- | ---------------------------------- | ------------------------------------------------- | -| `global` | Truefoundry global values | `{}` | -| `image.repository` | Image repository for tfyLLMGateway | `tfy.jfrog.io/tfy-private-images/tfy-llm-gateway` | -| `image.tag` | Image tag for the tfyLLMGateway | `510cb55e8ff708cbc0b0fbdf02ea9b104bbdc846` | -| `replicaCount` | Number of replicas | `1` | -| `environmentName` | The environment name | `default` | -| `envSecretName` | The environment secret name | `tfy-llm-gateway-env-secret` | -| `imagePullPolicy` | Image pull policy | `IfNotPresent` | -| `nameOverride` | Name override | `""` | -| `fullnameOverride` | 
Fullname override | `""` | -| `podAnnotations` | Pod annotations | `{}` | -| `podSecurityContext` | Pod security context | `{}` | -| `commonLabels` | Common labels | `{}` | -| `securityContext` | Security context configuration | `{}` | -| `healthcheck.enabled` | Enable healthcheck | `true` | -| `healthcheck.readiness.port` | Port to probe | `8787` | -| `healthcheck.readiness.path` | Path to probe | `/` | -| `healthcheck.readiness.initialDelaySeconds` | Initial delay in seconds | `10` | -| `healthcheck.readiness.periodSeconds` | Period in seconds | `10` | -| `healthcheck.readiness.timeoutSeconds` | Timeout in seconds | `5` | -| `healthcheck.readiness.successThreshold` | Success threshold | `1` | -| `healthcheck.readiness.failureThreshold` | Failure threshold | `3` | -| `healthcheck.liveness.port` | Port to probe | `8787` | -| `healthcheck.liveness.path` | Path to probe | `/` | -| `resources.limits.cpu` | CPU limit | `2` | -| `resources.limits.memory` | Memory limit | `1024Mi` | -| `resources.limits.ephemeral-storage` | Ephemeral storage limit | `512Mi` | -| `resources.requests.cpu` | CPU request | `1` | -| `resources.requests.memory` | Memory request | `512Mi` | -| `resources.requests.ephemeral-storage` | Ephemeral storage request | `256Mi` | -| `nodeSelector` | Node selector | `{}` | -| `tolerations` | Tolerations | `{}` | -| `affinity` | Affinity | `{}` | -| `topologySpreadConstraints` | Topology spread constraints | `{}` | -| `ingress.enabled` | Enable ingress configuration | `false` | -| `ingress.annotations` | Ingress annotations | `{}` | -| `ingress.labels` | Ingress labels | `{}` | -| `ingress.ingressClassName` | Ingress class name | `istio` | -| `ingress.tls` | Ingress TLS configuration | `[]` | -| `ingress.hosts` | Ingress hosts | `[]` | -| `istio.virtualservice.enabled` | Enable virtual service | `false` | -| `istio.virtualservice.annotations` | Virtual service annotations | `{}` | -| `istio.virtualservice.gateways` | Virtual service gateways | `[]` | -| 
`istio.virtualservice.hosts` | Virtual service hosts | `[]` | -| `service.type` | Service type | `ClusterIP` | -| `service.port` | Service port | `8787` | -| `service.annotations` | Service annotations | `{}` | -| `serviceAccount.create` | Create service account | `true` | -| `serviceAccount.annotations` | Service account annotations | `{}` | -| `serviceAccount.name` | Service account name | `tfy-llm-gateway` | -| `extraVolumes` | Extra volumes | `[]` | -| `extraVolumeMounts` | Extra volume mounts | `[]` | -| `rbac.enabled` | Enable rbac | `true` | -| `autoscaling.enabled` | Enable autoscaling | `false` | -| `autoscaling.minReplicas` | Minimum number of replicas | `3` | -| `autoscaling.maxReplicas` | Maximum number of replicas | `100` | -| `autoscaling.targetCPUUtilizationPercentage` | Target CPU utilization percentage | `60` | +| Name | Description | Value | +| -------------------------------------------- | ------------------------------------------ | ------------------------------------------------- | +| `global.controlPlaneURL` | Control plane URL | `""` | +| `global.truefoundryReleaseName` | Truefoundry release name | `truefoundry` | +| `global.llmGatewayInfra.enabled` | Bool if llm gateway infra is enabled | `false` | +| `global.llmGatewayInfra.releaseName` | Release name for the tfy-llm-gateway-infra | `tfy-llm-gateway-infra` | +| `global.llmGatewayInfra.natsAdminPassword` | NATS admin password | `""` | +| `image.repository` | Image repository for tfyLLMGateway | `tfy.jfrog.io/tfy-private-images/tfy-llm-gateway` | +| `image.tag` | Image tag for the tfyLLMGateway | `86142d80c82e353061de0b1e22b0c78a8d4d86d5` | +| `fullnameOverride` | Full name override for the tfy-llm-gateway | `""` | +| `replicaCount` | Number of replicas | `1` | +| `environmentName` | The environment name | `default` | +| `envSecretName` | The environment secret name | `tfy-llm-gateway-env-secret` | +| `imagePullPolicy` | Image pull policy | `IfNotPresent` | +| `nameOverride` | Name override 
| `""` | +| `podAnnotations` | Pod annotations | `{}` | +| `podSecurityContext` | Pod security context | `{}` | +| `commonLabels` | Common labels | `{}` | +| `securityContext` | Security context configuration | `{}` | +| `healthcheck.enabled` | Enable healthcheck | `true` | +| `healthcheck.readiness.port` | Port to probe | `8787` | +| `healthcheck.readiness.path` | Path to probe | `/` | +| `healthcheck.readiness.initialDelaySeconds` | Initial delay in seconds | `10` | +| `healthcheck.readiness.periodSeconds` | Period in seconds | `10` | +| `healthcheck.readiness.timeoutSeconds` | Timeout in seconds | `5` | +| `healthcheck.readiness.successThreshold` | Success threshold | `1` | +| `healthcheck.readiness.failureThreshold` | Failure threshold | `3` | +| `healthcheck.liveness.port` | Port to probe | `8787` | +| `healthcheck.liveness.path` | Path to probe | `/` | +| `resources.limits.cpu` | CPU limit | `2` | +| `resources.limits.memory` | Memory limit | `1024Mi` | +| `resources.limits.ephemeral-storage` | Ephemeral storage limit | `512Mi` | +| `resources.requests.cpu` | CPU request | `1` | +| `resources.requests.memory` | Memory request | `512Mi` | +| `resources.requests.ephemeral-storage` | Ephemeral storage request | `256Mi` | +| `nodeSelector` | Node selector | `{}` | +| `tolerations` | Tolerations | `{}` | +| `affinity` | Affinity | `{}` | +| `topologySpreadConstraints` | Topology spread constraints | `{}` | +| `ingress.enabled` | Enable ingress configuration | `false` | +| `ingress.annotations` | Ingress annotations | `{}` | +| `ingress.labels` | Ingress labels | `{}` | +| `ingress.ingressClassName` | Ingress class name | `istio` | +| `ingress.tls` | Ingress TLS configuration | `[]` | +| `ingress.hosts` | Ingress hosts | `[]` | +| `istio.virtualservice.enabled` | Enable virtual service | `false` | +| `istio.virtualservice.annotations` | Virtual service annotations | `{}` | +| `istio.virtualservice.gateways` | Virtual service gateways | `[]` | +| 
`istio.virtualservice.hosts` | Virtual service hosts | `[]` | +| `service.type` | Service type | `ClusterIP` | +| `service.port` | Service port | `8787` | +| `service.annotations` | Service annotations | `{}` | +| `serviceAccount.create` | Create service account | `true` | +| `serviceAccount.annotations` | Service account annotations | `{}` | +| `serviceAccount.name` | Service account name | `tfy-llm-gateway` | +| `extraVolumes` | Extra volumes | `[]` | +| `extraVolumeMounts` | Extra volume mounts | `[]` | +| `rbac.enabled` | Enable rbac | `true` | +| `autoscaling.enabled` | Enable autoscaling | `true` | +| `autoscaling.minReplicas` | Minimum number of replicas | `3` | +| `autoscaling.maxReplicas` | Maximum number of replicas | `100` | +| `autoscaling.targetCPUUtilizationPercentage` | Target CPU utilization percentage | `60` | diff --git a/charts/tfy-llm-gateway/values.yaml b/charts/tfy-llm-gateway/values.yaml index 5cfce742c..a1d250767 100644 --- a/charts/tfy-llm-gateway/values.yaml +++ b/charts/tfy-llm-gateway/values.yaml @@ -1,13 +1,27 @@ ## @section Configuration for LLM Gateway -## @param global Truefoundry global values -global: {} +global: + ## @param global.controlPlaneURL Control plane URL + controlPlaneURL: "" + ## @param global.truefoundryReleaseName Truefoundry release name + truefoundryReleaseName: "truefoundry" + llmGatewayInfra: + ## @param global.llmGatewayInfra.enabled Bool if llm gateway infra is enabled + enabled: false + ## @param global.llmGatewayInfra.releaseName Release name for the tfy-llm-gateway-infra + releaseName: "tfy-llm-gateway-infra" + ## @param global.llmGatewayInfra.natsAdminPassword NATS admin password + natsAdminPassword: "" + ## Image configuration for llm-gateway image: ## @param image.repository Image repository for tfyLLMGateway repository: tfy.jfrog.io/tfy-private-images/tfy-llm-gateway ## @param image.tag Image tag for the tfyLLMGateway - tag: 510cb55e8ff708cbc0b0fbdf02ea9b104bbdc846 + tag: 
86142d80c82e353061de0b1e22b0c78a8d4d86d5 + +## @param fullnameOverride Full name override for the tfy-llm-gateway +fullnameOverride: "" ## @param replicaCount Number of replicas replicaCount: 1 @@ -19,8 +33,6 @@ envSecretName: tfy-llm-gateway-env-secret imagePullPolicy: IfNotPresent ## @param nameOverride Name override nameOverride: '' -## @param fullnameOverride Fullname override -fullnameOverride: '' ## @param podAnnotations Pod annotations podAnnotations: {} ## @param podSecurityContext Pod security context @@ -120,9 +132,9 @@ serviceAccount: ## @param serviceAccount.create Create service account create: true ## @param serviceAccount.annotations Service account annotations - name: tfy-llm-gateway - ## @param serviceAccount.name Service account name annotations: {} + ## @param serviceAccount.name Service account name + name: tfy-llm-gateway ## @param extraVolumes Extra volumes extraVolumes: [] ## @param extraVolumeMounts Extra volume mounts @@ -131,24 +143,24 @@ extraVolumeMounts: [] rbac: ## @param rbac.enabled Enable rbac enabled: true -## @skip env -env: - CONTROL_PLANE_URL: "" - TFY_API_KEY: ${k8s-secret/truefoundry-creds/TFY_API_KEY} - AUTH_SERVER_URL: https://auth.truefoundry.com - LOG_LEVEL: info - GATEWAY_NATS_CONFIGURATION: "" - DEPLOYED_LLM_GATEWAY_URL: "" - CONTROL_PLANE_NATS_URL: "" - ENABLE_EXTERNAL_OAUTH: "false" ## Autoscaling configuration autoscaling: ## @param autoscaling.enabled Enable autoscaling - enabled: false + enabled: true ## @param autoscaling.minReplicas Minimum number of replicas minReplicas: 3 ## @param autoscaling.maxReplicas Maximum number of replicas maxReplicas: 100 ## @param autoscaling.targetCPUUtilizationPercentage Target CPU utilization percentage targetCPUUtilizationPercentage: 60 +## @skip env Environment variables for the llmGateway +env: + CONTROL_PLANE_URL: "{{ .Values.global.controlPlaneURL }}" + TFY_API_KEY: ${k8s-secret/truefoundry-creds/TFY_API_KEY} + AUTH_SERVER_URL: https://auth.truefoundry.com + LOG_LEVEL: info + 
GATEWAY_NATS_CONFIGURATION: '{"type":"nats","url":"ws://{{ .Values.global.llmGatewayInfra.releaseName }}-nats.{{ .Release.Namespace }}.svc.cluster.local:8080","username":"admin","password":"{{ .Values.global.llmGatewayInfra.natsAdminPassword }}"}' + DEPLOYED_LLM_GATEWAY_URL: "{{ .Values.global.controlPlaneURL }}/api/llm" + CONTROL_PLANE_NATS_URL: ws://{{ .Values.global.truefoundryReleaseName }}-nats.{{ .Release.Namespace }}.svc.cluster.local:8080 + ENABLE_EXTERNAL_OAUTH: "false" diff --git a/charts/tfy-notebook-controller/README.md b/charts/tfy-notebook-controller/README.md index b70a8b1fa..ecce3e281 100644 --- a/charts/tfy-notebook-controller/README.md +++ b/charts/tfy-notebook-controller/README.md @@ -6,61 +6,62 @@ This Helm chart package, provided by TrueFoundry, contains configurations and re ### tfy-notebook-controller configurations -| Name | Description | Value | -| -------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------- | 
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `labels.app` | Label for the application. | `notebook-controller` | -| `labels.kustomize.component` | Label for the Kustomize component. | `notebook-controller` | -| `namespaceOverride` | Namespace override for the notebook controller. | `""` | -| `imagePullSecrets` | Image pull secrets for the notebook controller. | `[]` | -| `istioGateway` | Istio Gateway for the notebook controller. | `istio-system/tfy-wildcard` | -| `image.pullPolicy` | Pull Policy for notebook controller. | `IfNotPresent` | -| `image.repository` | Image repository for the notebook controller. | `tfy.jfrog.io/tfy-images/tfy-notebook-controller` | -| `image.tag` | Image tag for the notebook controller. | `2ea1ddca29998b7b87a86eead70237b21d86f220` | -| `resources.limits.cpu` | CPU limit for the notebook controller. | `100m` | -| `resources.limits.memory` | Memory limit for the notebook controller. 
| `256Mi` | -| `resources.limits.ephemeral-storage` | Ephemeral storage limit for the notebook controller. | `256Mi` | -| `resources.requests.cpu` | CPU request for the notebook controller. | `50m` | -| `resources.requests.memory` | Memory request for the notebook controller. | `128Mi` | -| `resources.requests.ephemeral-storage` | Ephemeral storage request for the notebook controller. | `128Mi` | -| `tolerations` | list of tolerations | `[]` | -| `affinity` | Affinity for the notebook controller deployment | `{}` | -| `notebookBaseDomainUrl` | Base domain URL for the notebook. | `` | -| `oauth.enabled` | Secret key used for OAuth2 authentication. This key should be kept confidential. | `false` | -| `oauth.type` | Secret key used for OAuth2 authentication. This key should be kept confidential. | `truefoundry` | -| `oauth.clientId` | Secret key used for OAuth2 authentication. This key should be kept confidential. | `""` | -| `oauth.tokenEndpoint` | Secret key used for OAuth2 authentication. This key should be kept confidential. | `""` | -| `oauth.authorizationEndpoint` | Secret key used for OAuth2 authentication. This key should be kept confidential. | `""` | -| `oauth.jwksUri` | Secret key used for OAuth2 authentication. This key should be kept confidential. | `""` | -| `oauth.clientSecret` | Secret key used for OAuth2 authentication. This key should be kept confidential. | `""` | -| `oauth.hmac` | HMAC key used for encoding/decoding tokens in OAuth2. This key is crucial for maintaining token integrity and security. | `""` | -| `oauth.truefoundryExternalAuthorization.enabled` | Enable Truefoundry authorization for notebooks deployed. 
| `true` | -| `oauth.truefoundryExternalAuthorization.controlPlaneURL` | Control plane URL for TrueFoundry | `""` | -| `oauth.truefoundryExternalAuthorization.image` | Image of Truefoundry external authorization server | `tfy.jfrog.io/tfy-images/ext-authz-server:762b460727a2d2627216b0e5c7b311711a739885` | -| `sdsServer.replicas` | Number of replicas of sds server | `2` | -| `sdsServer.image` | image for sds-server | `tfy.jfrog.io/tfy-images/sds-server:bea4f5dbc0aafad1d3ada37f5a2b22a188318c92` | -| `sdsServer.affinity` | Node affinity for sds-server | `{}` | -| `sdsServer.imagePullSecrets` | Image pull credentials for sds-server | `[]` | -| `sdsServer.command` | Command and arguments to start the sds-server application. | `["/app/sds-server","--port","8000","--file","/secrets/secrets.yaml"]` | -| `sdsServer.readinessProbe.tcpSocket.port` | Port for TCP socket used in readiness probe | `8000` | -| `sdsServer.readinessProbe.initialDelaySeconds` | Initial delay before performing readiness probe | `15` | -| `sdsServer.readinessProbe.periodSeconds` | Frequency of performing readiness probe | `10` | -| `sdsServer.livenessProbe.tcpSocket.port` | Port for TCP socket used in liveness probe | `8000` | -| `sdsServer.livenessProbe.initialDelaySeconds` | Initial delay before performing liveness probe | `15` | -| `sdsServer.livenessProbe.periodSeconds` | Frequency of performing liveness probe | `10` | -| `sdsServer.deploymentStrategy.type` | Type of deployment strategy | `RollingUpdate` | -| `sdsServer.deploymentStrategy.rollingUpdate.maxSurge` | Max pods above desired number | `1` | -| `sdsServer.deploymentStrategy.rollingUpdate.maxUnavailable` | Max pods unavailable during update | `1` | -| `sdsServer.ports[0].containerPort` | The port on which the container is listening. | `8000` | -| `sdsServer.ports[0].name` | The name assigned to this port. | `port-8000` | -| `sdsServer.ports[0].protocol` | The protocol used by this port (TCP/UDP). 
| `TCP` | -| `sdsServer.resources.limits.cpu` | The maximum CPU resources allocated. | `0.02` | -| `sdsServer.resources.limits.ephemeral-storage` | The maximum ephemeral storage allocated. | `20M` | -| `sdsServer.resources.limits.memory` | The maximum memory resources allocated. | `50M` | -| `sdsServer.resources.requests.cpu` | The minimum CPU resources requested. | `0.01` | -| `sdsServer.resources.requests.ephemeral-storage` | The minimum ephemeral storage requested. | `10M` | -| `sdsServer.resources.requests.memory` | The minimum memory resources requested. | `30M` | -| `sdsServer.tolerations` | Spot tolerations for the notebook controller deployment. | `[]` | -| `sdsServer.topologySpreadConstraints[0].labelSelector.matchLabels.truefoundry.com/component` | Component label for the sds-server. | `{"replicas":2,"image":"tfy.jfrog.io/tfy-images/sds-server:bea4f5dbc0aafad1d3ada37f5a2b22a188318c92","affinity":{},"imagePullSecrets":[],"command":["/app/sds-server","--port","8000","--file","/secrets/secrets.yaml"],"readinessProbe":{"tcpSocket":{"port":8000},"initialDelaySeconds":15,"periodSeconds":10},"livenessProbe":{"tcpSocket":{"port":8000},"initialDelaySeconds":15,"periodSeconds":10},"deploymentStrategy":{"type":"RollingUpdate","rollingUpdate":{"maxSurge":1,"maxUnavailable":1}},"ports":[{"containerPort":8000,"name":"port-8000","protocol":"TCP"}],"resources":{"limits":{"cpu":0.02,"ephemeral-storage":"20M","memory":"50M"},"requests":{"cpu":0.01,"ephemeral-storage":"10M","memory":"30M"}},"tolerations":[{"effect":"NoSchedule","key":"cloud.google.com/gke-spot","operator":"Equal","value":"true"},{"effect":"NoSchedule","key":"kubernetes.azure.com/scalesetpriority","operator":"Equal","value":"spot"}],"topologySpreadConstraints":[{"labelSelector":{"matchLabels":{"truefoundry.com/component":"sds-server"}},"maxSkew":1,"topologyKey":"topology.kubernetes.io/zone","whenUnsatisfiable":"ScheduleAnyway"}]}` | -| `sdsServer.topologySpreadConstraints[0].maxSkew` | Define the maximum 
skew of pods across topology domains. | `1` | -| `sdsServer.topologySpreadConstraints[0].topologyKey` | The key for the node labels used in determining the topology spread. | `topology.kubernetes.io/zone` | -| `sdsServer.topologySpreadConstraints[0].whenUnsatisfiable` | Behavior policy when spreading constraints cannot be satisfied. | `ScheduleAnyway` | +| Name | Description | Value | +| -------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `labels.app` | Label for the application. | `notebook-controller` | +| `labels.kustomize.component` | Label for the Kustomize component. | `notebook-controller` | +| `namespaceOverride` | Namespace override for the notebook controller. 
| `""` | +| `imagePullSecrets` | Image pull secrets for the notebook controller. | `[]` | +| `istioGateway` | Istio Gateway for the notebook controller. | `istio-system/tfy-wildcard` | +| `image.pullPolicy` | Pull Policy for notebook controller. | `IfNotPresent` | +| `image.repository` | Image repository for the notebook controller. | `tfy.jfrog.io/tfy-images/tfy-notebook-controller` | +| `image.tag` | Image tag for the notebook controller. | `2ea1ddca29998b7b87a86eead70237b21d86f220` | +| `resources.limits.cpu` | CPU limit for the notebook controller. | `100m` | +| `resources.limits.memory` | Memory limit for the notebook controller. | `256Mi` | +| `resources.limits.ephemeral-storage` | Ephemeral storage limit for the notebook controller. | `256Mi` | +| `resources.requests.cpu` | CPU request for the notebook controller. | `50m` | +| `resources.requests.memory` | Memory request for the notebook controller. | `128Mi` | +| `resources.requests.ephemeral-storage` | Ephemeral storage request for the notebook controller. | `128Mi` | +| `tolerations` | list of tolerations | `[]` | +| `affinity` | Affinity for the notebook controller deployment | `{}` | +| `notebookBaseDomainUrl` | Base domain URL for the notebook. | `` | +| `oauth.enabled` | Secret key used for OAuth2 authentication. This key should be kept confidential. | `false` | +| `oauth.type` | Secret key used for OAuth2 authentication. This key should be kept confidential. | `truefoundry` | +| `oauth.clientId` | Secret key used for OAuth2 authentication. This key should be kept confidential. | `""` | +| `oauth.tokenEndpoint` | Secret key used for OAuth2 authentication. This key should be kept confidential. | `""` | +| `oauth.authorizationEndpoint` | Secret key used for OAuth2 authentication. This key should be kept confidential. | `""` | +| `oauth.jwksUri` | Secret key used for OAuth2 authentication. This key should be kept confidential. | `""` | +| `oauth.clientSecret` | Secret key used for OAuth2 authentication. 
This key should be kept confidential. | `""` | +| `oauth.hmac` | HMAC key used for encoding/decoding tokens in OAuth2. This key is crucial for maintaining token integrity and security. | `""` | +| `oauth.truefoundryExternalAuthorization.enabled` | Enable Truefoundry authorization for notebooks deployed. | `true` | +| `oauth.truefoundryExternalAuthorization.controlPlaneURL` | Control plane URL for TrueFoundry | `""` | +| `oauth.truefoundryExternalAuthorization.image` | Image of Truefoundry external authorization server | `tfy.jfrog.io/tfy-images/ext-authz-server:762b460727a2d2627216b0e5c7b311711a739885` | +| `sdsServer.replicas` | Number of replicas of sds server | `2` | +| `sdsServer.image.repository` | Repository for sds-server | `tfy.jfrog.io/tfy-images/sds-server` | +| `sdsServer.image.tag` | Tag for sds-server | `bea4f5dbc0aafad1d3ada37f5a2b22a188318c92` | +| `sdsServer.affinity` | Node affinity for sds-server | `{}` | +| `sdsServer.imagePullSecrets` | Image pull credentials for sds-server | `[]` | +| `sdsServer.command` | Command and arguments to start the sds-server application. 
| `["/app/sds-server","--port","8000","--file","/secrets/secrets.yaml"]` | +| `sdsServer.readinessProbe.tcpSocket.port` | Port for TCP socket used in readiness probe | `8000` | +| `sdsServer.readinessProbe.initialDelaySeconds` | Initial delay before performing readiness probe | `15` | +| `sdsServer.readinessProbe.periodSeconds` | Frequency of performing readiness probe | `10` | +| `sdsServer.livenessProbe.tcpSocket.port` | Port for TCP socket used in liveness probe | `8000` | +| `sdsServer.livenessProbe.initialDelaySeconds` | Initial delay before performing liveness probe | `15` | +| `sdsServer.livenessProbe.periodSeconds` | Frequency of performing liveness probe | `10` | +| `sdsServer.deploymentStrategy.type` | Type of deployment strategy | `RollingUpdate` | +| `sdsServer.deploymentStrategy.rollingUpdate.maxSurge` | Max pods above desired number | `1` | +| `sdsServer.deploymentStrategy.rollingUpdate.maxUnavailable` | Max pods unavailable during update | `1` | +| `sdsServer.ports[0].containerPort` | The port on which the container is listening. | `8000` | +| `sdsServer.ports[0].name` | The name assigned to this port. | `port-8000` | +| `sdsServer.ports[0].protocol` | The protocol used by this port (TCP/UDP). | `TCP` | +| `sdsServer.resources.limits.cpu` | The maximum CPU resources allocated. | `0.02` | +| `sdsServer.resources.limits.ephemeral-storage` | The maximum ephemeral storage allocated. | `20M` | +| `sdsServer.resources.limits.memory` | The maximum memory resources allocated. | `50M` | +| `sdsServer.resources.requests.cpu` | The minimum CPU resources requested. | `0.01` | +| `sdsServer.resources.requests.ephemeral-storage` | The minimum ephemeral storage requested. | `10M` | +| `sdsServer.resources.requests.memory` | The minimum memory resources requested. | `30M` | +| `sdsServer.tolerations` | Spot tolerations for the notebook controller deployment. 
| `[]` | +| `sdsServer.topologySpreadConstraints[0].labelSelector.matchLabels.truefoundry.com/component` | Component label for the sds-server. | `{"replicas":2,"image":{"repository":"tfy.jfrog.io/tfy-images/sds-server","tag":"bea4f5dbc0aafad1d3ada37f5a2b22a188318c92"},"affinity":{},"imagePullSecrets":[],"command":["/app/sds-server","--port","8000","--file","/secrets/secrets.yaml"],"readinessProbe":{"tcpSocket":{"port":8000},"initialDelaySeconds":15,"periodSeconds":10},"livenessProbe":{"tcpSocket":{"port":8000},"initialDelaySeconds":15,"periodSeconds":10},"deploymentStrategy":{"type":"RollingUpdate","rollingUpdate":{"maxSurge":1,"maxUnavailable":1}},"ports":[{"containerPort":8000,"name":"port-8000","protocol":"TCP"}],"resources":{"limits":{"cpu":0.02,"ephemeral-storage":"20M","memory":"50M"},"requests":{"cpu":0.01,"ephemeral-storage":"10M","memory":"30M"}},"tolerations":[{"effect":"NoSchedule","key":"cloud.google.com/gke-spot","operator":"Equal","value":"true"},{"effect":"NoSchedule","key":"kubernetes.azure.com/scalesetpriority","operator":"Equal","value":"spot"}],"topologySpreadConstraints":[{"labelSelector":{"matchLabels":{"truefoundry.com/component":"sds-server"}},"maxSkew":1,"topologyKey":"topology.kubernetes.io/zone","whenUnsatisfiable":"ScheduleAnyway"}]}` | +| `sdsServer.topologySpreadConstraints[0].maxSkew` | Define the maximum skew of pods across topology domains. | `1` | +| `sdsServer.topologySpreadConstraints[0].topologyKey` | The key for the node labels used in determining the topology spread. | `topology.kubernetes.io/zone` | +| `sdsServer.topologySpreadConstraints[0].whenUnsatisfiable` | Behavior policy when spreading constraints cannot be satisfied. 
| `ScheduleAnyway` | diff --git a/charts/tfy-notebook-controller/templates/sds-deployment.yaml b/charts/tfy-notebook-controller/templates/sds-deployment.yaml index 5de0ede02..196f6a5f7 100644 --- a/charts/tfy-notebook-controller/templates/sds-deployment.yaml +++ b/charts/tfy-notebook-controller/templates/sds-deployment.yaml @@ -39,7 +39,7 @@ spec: {{- with .Values.sdsServer.command }} {{- toYaml . | nindent 12 }} {{- end }} - image: "{{ .Values.sdsServer.image }}" + image: {{ .Values.sdsServer.image.repository }}:{{ .Values.sdsServer.image.tag }} imagePullPolicy: IfNotPresent name: sds-server readinessProbe: diff --git a/charts/tfy-notebook-controller/values.yaml b/charts/tfy-notebook-controller/values.yaml index 19f2f4646..5f81c40aa 100644 --- a/charts/tfy-notebook-controller/values.yaml +++ b/charts/tfy-notebook-controller/values.yaml @@ -104,8 +104,11 @@ oauth: sdsServer: ## @param sdsServer.replicas Number of replicas of sds server replicas: 2 - ## @param sdsServer.image image for sds-server - image: "tfy.jfrog.io/tfy-images/sds-server:bea4f5dbc0aafad1d3ada37f5a2b22a188318c92" + image: + ## @param sdsServer.image.repository Repository for sds-server + repository: tfy.jfrog.io/tfy-images/sds-server + ## @param sdsServer.image.tag Tag for sds-server + tag: "bea4f5dbc0aafad1d3ada37f5a2b22a188318c92" ## @param sdsServer.affinity [object] Node affinity for sds-server affinity: {} ## @param sdsServer.imagePullSecrets [array] Image pull credentials for sds-server diff --git a/charts/truefoundry/Chart.lock b/charts/truefoundry/Chart.lock index e5a0790e0..1e92e9406 100644 --- a/charts/truefoundry/Chart.lock +++ b/charts/truefoundry/Chart.lock @@ -10,6 +10,6 @@ dependencies: version: 15.2.2 - name: tfy-buildkitd-service repository: https://truefoundry.github.io/infra-charts/ - version: 0.2.1-rc.1 -digest: sha256:0f3eab55d9395afea9ba9382c532dd53c61acc58c7e1ea87adb85a5a51dd1c71 -generated: "2024-11-08T15:07:07.114652559+05:30" + version: 0.2.1 +digest: 
sha256:5140d88c4f62122247e888f70adbfc4b684adc7ce3fa34b46347c8fbaf2d65c0 +generated: "2024-11-22T17:27:46.263843+05:30" diff --git a/charts/truefoundry/Chart.yaml b/charts/truefoundry/Chart.yaml index 28f45a211..f390febee 100644 --- a/charts/truefoundry/Chart.yaml +++ b/charts/truefoundry/Chart.yaml @@ -1,6 +1,6 @@ apiVersion: v2 name: truefoundry -version: 0.12.1 +version: 0.13.4 description: "TrueFoundry Control Plane Components" maintainers: - name: truefoundry @@ -20,4 +20,4 @@ dependencies: - condition: tfy-buildkitd-service.enabled name: tfy-buildkitd-service repository: https://truefoundry.github.io/infra-charts/ - version: 0.2.1-rc.1 + version: 0.2.1 diff --git a/charts/truefoundry/README.md b/charts/truefoundry/README.md index 2c4d8cf67..23c99ce6f 100644 --- a/charts/truefoundry/README.md +++ b/charts/truefoundry/README.md @@ -11,7 +11,7 @@ truefoundry is an applications that gets deployed on the kubernetes cluster to s | `global.truefoundryImagePullConfigJSON` | JSON config for image pull secret | `""` | | `global.tenantName` | Name of the tenant | `""` | | `global.controlPlaneURL` | URL of the control plane | `http://truefoundry-truefoundry-frontend-app.truefoundry.svc.cluster.local:5000` | -| `global.controlPlaneChartVersion` | Version of control-plane chart | `0.12.1` | +| `global.controlPlaneChartVersion` | Version of control-plane chart | `0.6.2` | | `global.existingTruefoundryCredsSecret` | Name of the existing truefoundry creds secret | `""` | | `global.database.host` | Control plane database hostname when dev mode is not enabled | `""` | | `global.database.name` | Control plane database name when dev mode is not enabled | `""` | @@ -23,20 +23,19 @@ truefoundry is an applications that gets deployed on the kubernetes cluster to s ### Truefoundry bootstrap values -| Name | Description | Value | -| ----------------------------------------------------- | ------------------------------------------------------- | ---------------------------------------- | 
-| `truefoundryBootstrap.enabled` | Bool to enable truefoundry bootstrap | `true` | -| `truefoundryBootstrap.image.repository` | Truefoundry bootstrap image repository | `tfy.jfrog.io/tfy-mirror/library/ubuntu` | -| `truefoundryBootstrap.image.tag` | Truefoundry bootstrap image tag | `latest` | -| `truefoundryBootstrap.natsConfigmapName` | Truefoundry nats configmap name | `nats-accounts` | -| `truefoundryBootstrap.extraEnvVars` | Extra environment variables for the bootstrap container | `[]` | -| `truefoundryBootstrap.extraVolumeMounts` | Extra volume mounts for the bootstrap container | `[]` | -| `truefoundryBootstrap.extraVolumes` | Extra volumes for the bootstrap container | `[]` | -| `truefoundryBootstrap.affinity` | Affinity for the bootstrap container | `{}` | -| `truefoundryBootstrap.nodeSelector` | Node selector for the bootstrap container | `{}` | -| `truefoundryBootstrap.tolerations` | Tolerations specific to the bootstrap container | `{}` | -| `truefoundryBootstrap.imagePullSecrets` | Image pull secrets for the bootstrap container | `[]` | -| `truefoundryBootstrap.createdBuildkitServiceTlsCerts` | Bool to install TLS certificates | `false` | +| Name | Description | Value | +| ---------------------------------------- | ------------------------------------------------------- | ---------------------------------------- | +| `truefoundryBootstrap.enabled` | Bool to enable truefoundry bootstrap | `true` | +| `truefoundryBootstrap.image.repository` | Truefoundry bootstrap image repository | `tfy.jfrog.io/tfy-mirror/library/ubuntu` | +| `truefoundryBootstrap.image.tag` | Truefoundry bootstrap image tag | `latest` | +| `truefoundryBootstrap.natsConfigmapName` | Truefoundry nats configmap name | `nats-accounts` | +| `truefoundryBootstrap.extraEnvVars` | Extra environment variables for the bootstrap container | `[]` | +| `truefoundryBootstrap.extraVolumeMounts` | Extra volume mounts for the bootstrap container | `[]` | +| `truefoundryBootstrap.extraVolumes` | 
Extra volumes for the bootstrap container | `[]` | +| `truefoundryBootstrap.affinity` | Affinity for the bootstrap container | `{}` | +| `truefoundryBootstrap.nodeSelector` | Node selector for the bootstrap container | `{}` | +| `truefoundryBootstrap.tolerations` | Tolerations specific to the bootstrap container | `{}` | +| `truefoundryBootstrap.imagePullSecrets` | Image pull secrets for the bootstrap container | `[]` | ### Truefoundry Frontend App values @@ -47,7 +46,7 @@ truefoundry is an applications that gets deployed on the kubernetes cluster to s | `truefoundryFrontendApp.replicaCount` | Number of replicas for the frontend app | `1` | | `truefoundryFrontendApp.global` | Global values for the frontend app | `{}` | | `truefoundryFrontendApp.image.repository` | Image repository for the frontend app | `tfy.jfrog.io/tfy-private-images/truefoundry-frontend-app` | -| `truefoundryFrontendApp.image.tag` | Image tag for the frontend app | `v0.11.0` | +| `truefoundryFrontendApp.image.tag` | Image tag for the frontend app | `v0.5.1` | | `truefoundryFrontendApp.envSecretName` | Secret name for the frontend app environment variables | `truefoundry-frontend-app-env-secret` | | `truefoundryFrontendApp.imagePullPolicy` | Image pull policy for the frontend app | `IfNotPresent` | | `truefoundryFrontendApp.nameOverride` | Override name for the frontend app | `""` | @@ -96,7 +95,7 @@ truefoundry is an applications that gets deployed on the kubernetes cluster to s | `mlfoundryServer.enabled` | Bool to enable the mlfoundry server | `true` | | `mlfoundryServer.tolerations` | Tolerations specific to the mlfoundry server | `{}` | | `mlfoundryServer.image.repository` | Image repository for the mlfoundry server | `tfy.jfrog.io/tfy-private-images/mlfoundry-server` | -| `mlfoundryServer.image.tag` | Image tag for the mlfoundry server | `v0.9.0` | +| `mlfoundryServer.image.tag` | Image tag for the mlfoundry server | `v0.4.0` | | `mlfoundryServer.replicaCount` | Number of replicas for the 
mlfoundry server | `1` | | `mlfoundryServer.environmentName` | Environment name for the mlfoundry server | `default` | | `mlfoundryServer.envSecretName` | Secret name for the mlfoundry server environment variables | `mlfoundry-server-env-secret` | @@ -133,7 +132,7 @@ truefoundry is an applications that gets deployed on the kubernetes cluster to s | `servicefoundryServer.replicaCount` | Number of replicas for the servicefoundry server | `1` | | `servicefoundryServer.global` | Global values for the servicefoundry server | `{}` | | `servicefoundryServer.image.repository` | Image repository for the servicefoundry server | `tfy.jfrog.io/tfy-private-images/servicefoundry-server` | -| `servicefoundryServer.image.tag` | Image tag for the servicefoundry server | `v0.12.0` | +| `servicefoundryServer.image.tag` | Image tag for the servicefoundry server | `v0.6.1` | | `servicefoundryServer.environmentName` | Environment name for the servicefoundry server | `default` | | `servicefoundryServer.envSecretName` | Secret name for the servicefoundry server environment variables | `servicefoundry-server-env-secret` | | `servicefoundryServer.imagePullPolicy` | Image pull policy for the servicefoundry server | `IfNotPresent` | @@ -161,9 +160,7 @@ truefoundry is an applications that gets deployed on the kubernetes cluster to s | `servicefoundryServer.imagePullSecrets` | Image pull credentials for servicefoundry server | `[]` | | `servicefoundryServer.rbac.enabled` | Enable RBAC for the servicefoundry server | `true` | | `servicefoundryServer.configs.cicdTemplates` | CICD Template for servicefoundry server | `{{ .Release.Name }}-cicd-templates-cm` | -| `servicefoundryServer.configs.workbenchImages` | Workbench Images for workbench deployments | `{{ .Release.Name }}-workbench-images-cm` | -| `servicefoundryServer.configs.imageMutationPolicy` | Image Mutations policy for workloads | `{{ .Release.Name }}-image-mutation-policy-cm` | -| 
`servicefoundryServer.configs.k8sManifestValidationPolicy` | K8s Manifest Validation policy for workloads | `{{ .Release.Name }}-k8s-manifest-validation-policy-cm` | +| `servicefoundryServer.configs.workbenchImages` | Workbench Images for servicefoundry server | `{{ .Release.Name }}-workbench-images-cm` | ### tfyK8sController Truefoundry tfy k8s controller values @@ -174,7 +171,7 @@ truefoundry is an applications that gets deployed on the kubernetes cluster to s | `tfyK8sController.replicaCount` | Number of replicas for the tfyK8sController | `1` | | `tfyK8sController.global` | Global values for the tfyK8sController | `{}` | | `tfyK8sController.image.repository` | Image repository for the tfyK8sController | `tfy.jfrog.io/tfy-private-images/tfy-k8s-controller` | -| `tfyK8sController.image.tag` | Image tag for the tfyK8sController | `v0.9.0` | +| `tfyK8sController.image.tag` | Image tag for the tfyK8sController | `v0.3.0` | | `tfyK8sController.environmentName` | Environment name for tfyK8sController | `default` | | `tfyK8sController.envSecretName` | Secret name for the tfyK8sController environment variables | `tfy-k8s-controller-env-secret` | | `tfyK8sController.imagePullPolicy` | Image pull policy for the tfyK8sController | `IfNotPresent` | @@ -209,7 +206,7 @@ truefoundry is an applications that gets deployed on the kubernetes cluster to s | `sfyManifestService.tolerations` | Tolerations specific to the sfy manifest service | `{}` | | `sfyManifestService.global` | Global values for the sfy manifest service | `{}` | | `sfyManifestService.image.repository` | Image repository for the sfy manifest service | `tfy.jfrog.io/tfy-private-images/sfy-manifest-service` | -| `sfyManifestService.image.tag` | Image tag for the sfy manifest service | `v0.9.0` | +| `sfyManifestService.image.tag` | Image tag for the sfy manifest service | `v0.3.0` | | `sfyManifestService.replicaCount` | Number of replicas for the sfy manifest service | `1` | | `sfyManifestService.environmentName` | 
Environment name for the sfy manifest service | `default` | | `sfyManifestService.envSecretName` | Secret name for the sfy manifest service environment variables | `sfy-manifest-service-env-secret` | @@ -277,7 +274,7 @@ truefoundry is an applications that gets deployed on the kubernetes cluster to s | `nats.cluster.replicas` | Number of replicas for NATS cluster | `3` | | `nats.cluster.noAdvertise` | Bool to enable NATS cluster | `true` | | `nats.websocket.enabled` | Bool to enable NATS websocket | `true` | -| `nats.websocket.port` | Port for NATS websocket | `8080` | +| `nats.websocket.port` | Port for NATS websocket | `443` | | `nats.websocket.noTLS` | Bool to enable NATS websocket without TLS | `true` | | `nats.websocket.sameOrigin` | Bool to enable NATS websocket same origin | `false` | | `nats.websocket.allowedOrigins` | Allowed origins for NATS websocket | `[]` | @@ -286,68 +283,61 @@ truefoundry is an applications that gets deployed on the kubernetes cluster to s ### tfyBuild Truefoundry tfy build settings -| Name | Description | Value | -| ------------------------------------------------------------------------------- | ------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `tfyBuild.enabled` | Bool to enable the tfyBuild server | `true` | -| `tfyBuild.global` | Global values for the tfyBuild server | `{}` | -| `tfyBuild.nameOverride` | Override name for the tfyBuild server | `""` | -| `tfyBuild.fullnameOverride` | Full name override for the tfyBuild server | `""` | -| `tfyBuild.serviceAccount.annotations` | Annotations for the tfyBuild server service account | `{}` | -| `tfyBuild.preemptibleDeployment.enabled` | Bool to enable 
preemptible deployment for the tfyBuild server | `false` | -| `tfyBuild.preemptibleDeployment.image.repository` | Repository for the preemptible deployment | `tfy.jfrog.io/tfy-mirror/alpine` | -| `tfyBuild.preemptibleDeployment.image.tag` | Tag for the preemptible deployment | `3.20` | -| `tfyBuild.preemptibleDeployment.imagePullSecrets` | Image pull secrets for the preemptible deployment | `[]` | -| `tfyBuild.preemptibleDeployment.affinity` | Affinity settings for the preemptible deployment | `{}` | -| `tfyBuild.preemptibleDeployment.nodeSelector` | Node selector for the preemptible deployment | `{}` | -| `tfyBuild.preemptibleDeployment.tolerations` | Tolerations for the preemptible deployment | `[]` | -| `tfyBuild.preemptibleDeployment.extraEnvs` | Extra environment variables for the tfyBuild server | `[]` | -| `tfyBuild.preemptibleDeployment.extraVolumeMounts` | Extra volume mounts for the tfyBuild server | `[]` | -| `tfyBuild.preemptibleDeployment.extraVolumes` | Extra volumes for the tfyBuild server | `[]` | -| `tfyBuild.truefoundryWorkflows.sfyBuilder.image.repository` | Repository for the sfyBuilder | `tfy.jfrog.io/tfy-images/sfy-builder` | -| `tfyBuild.truefoundryWorkflows.sfyBuilder.image.tag` | Tag for the sfyBuilder | `v0.8.6` | -| `tfyBuild.truefoundryWorkflows.sfyBuilder.imagePullSecrets` | Image pull secrets for the sfyBuilder | `[]` | -| `tfyBuild.truefoundryWorkflows.sfyBuilder.baseImagePullSecret` | baseImagePullSecret for the docker config | `""` | +| Name | Description | Value | +| ------------------------------------------------------------------------------- | ------------------------------------------------------------- | ---------------------------------------------------------------------------------------- | +| `tfyBuild.enabled` | Bool to enable the tfyBuild server | `true` | +| `tfyBuild.global` | Global values for the tfyBuild server | `{}` | +| `tfyBuild.nameOverride` | Override name for the tfyBuild server | `""` | +| 
`tfyBuild.fullnameOverride` | Full name override for the tfyBuild server | `""` | +| `tfyBuild.serviceAccount.annotations` | Annotations for the tfyBuild server service account | `{}` | +| `tfyBuild.preemptibleDeployment.enabled` | Bool to enable preemptible deployment for the tfyBuild server | `false` | +| `tfyBuild.preemptibleDeployment.image.repository` | Repository for the preemptible deployment | `tfy.jfrog.io/tfy-mirror/alpine` | +| `tfyBuild.preemptibleDeployment.image.tag` | Tag for the preemptible deployment | `3.20` | +| `tfyBuild.preemptibleDeployment.imagePullSecrets` | Image pull secrets for the preemptible deployment | `[]` | +| `tfyBuild.preemptibleDeployment.affinity` | Affinity settings for the preemptible deployment | `{}` | +| `tfyBuild.preemptibleDeployment.nodeSelector` | Node selector for the preemptible deployment | `{}` | +| `tfyBuild.preemptibleDeployment.tolerations` | Tolerations for the preemptible deployment | `[]` | +| `tfyBuild.preemptibleDeployment.extraEnvs` | Extra environment variables for the tfyBuild server | `[]` | +| `tfyBuild.preemptibleDeployment.extraVolumeMounts` | Extra volume mounts for the tfyBuild server | `[]` | +| `tfyBuild.preemptibleDeployment.extraVolumes` | Extra volumes for the tfyBuild server | `[]` | +| `tfyBuild.truefoundryWorkflows.sfyBuilder.image.repository` | Repository for the sfyBuilder | `tfy.jfrog.io/tfy-images/sfy-builder` | +| `tfyBuild.truefoundryWorkflows.sfyBuilder.image.tag` | Tag for the sfyBuilder | `v0.8.6` | +| `tfyBuild.truefoundryWorkflows.sfyBuilder.imagePullSecrets` | Image pull secrets for the sfyBuilder | `[]` | | `tfyBuild.truefoundryWorkflows.sfyBuilder.script` | script for the sfyBuilder to be executed | `download-code.sh registry-login.sh wait-for-builder.sh build-and-push.sh - -# This script will be executed only when all the above scripts are successfully executed. If any of the above scripts fail, this script will not be executed, and the build will be marked as failed. 
-update-build.sh '{"status":"SUCCEEDED"}' -` | -| `tfyBuild.truefoundryWorkflows.sfyBuilder.resources.limits.cpu` | CPU limit for the sfyBuilder | `1` | -| `tfyBuild.truefoundryWorkflows.sfyBuilder.resources.limits.ephemeral-storage` | Ephemeral storage limit for the sfyBuilder | `20Gi` | -| `tfyBuild.truefoundryWorkflows.sfyBuilder.resources.limits.memory` | Memory limit for the sfyBuilder | `2Gi` | -| `tfyBuild.truefoundryWorkflows.sfyBuilder.resources.requests.cpu` | CPU request for the sfyBuilder | `200m` | -| `tfyBuild.truefoundryWorkflows.sfyBuilder.resources.requests.ephemeral-storage` | Ephemeral storage request for the sfyBuilder | `10Gi` | -| `tfyBuild.truefoundryWorkflows.sfyBuilder.resources.requests.memory` | Memory request for the sfyBuilder | `500Mi` | -| `tfyBuild.truefoundryWorkflows.sfyBuilder.buildSecrets` | Build secrets for the sfyBuilder | `[]` | -| `tfyBuild.truefoundryWorkflows.extraEnvs` | Extra environment variables for the tfyBuild server | `[]` | -| `tfyBuild.truefoundryWorkflows.extraVolumeMounts` | Extra volume mounts for the tfyBuild server | `[]` | -| `tfyBuild.truefoundryWorkflows.extraVolumes` | Extra volumes for the tfyBuild server | `[]` | -| `tfyBuild.truefoundryWorkflows.affinity` | Affinity settings for the tfyBuild server | `{}` | -| `tfyBuild.truefoundryWorkflows.nodeSelector` | Node selector for the tfyBuild server | `{}` | -| `tfyBuild.truefoundryWorkflows.logMarkers.error` | Error log marker for the tfyBuild server | `\u001b[31m[Error]\u001b[0m` | -| `tfyBuild.truefoundryWorkflows.logMarkers.done` | Done log marker for the tfyBuild server | `\u001b[32m[Done]\u001b[0m` | -| `tfyBuild.truefoundryWorkflows.logMarkers.start` | Start log marker for the tfyBuild server | `\u001b[36m[Start]\u001b[0m` | -| `tfyBuild.truefoundryWorkflows.logMarkers.clientPrefix` | Client prefix for the tfyBuild server | `["TFY-CLIENT"]` | -| `tfyBuild.truefoundryWorkflows.logMarkers.supportSlack` | Slack support URL for the tfyBuild server | 
`https://join.slack.com/t/truefoundry/shared_invite/zt-11ht512jq-nDJq~HJMqc6wBw90JVlo7g` | -| `tfyBuild.truefoundryWorkflows.logMarkers.serviceFoundryUiUrl` | Service foundry UI URL | `https://app.truefoundry.com/workspace` | -| `tfyBuild.truefoundryWorkflows.sociIndexBuildAndPush.enabled` | Bool to enable SOCI index build and push | `false` | -| `tfyBuild.truefoundryWorkflows.sociIndexBuildAndPush.image.repository` | Repository for the SOCI index build and push | `tfy.jfrog.io/tfy-images/sfy-builder` | -| `tfyBuild.truefoundryWorkflows.sociIndexBuildAndPush.image.tag` | Tag for the SOCI index build and push | `v0.8.6` | -| `tfyBuild.truefoundryWorkflows.sociIndexBuildAndPush.imagePullSecrets` | Image pull secrets for the sociIndexBuildAndPush | `[]` | -| `tfyBuild.truefoundryWorkflows.sociIndexBuildAndPush.imageSizeThresholdBytes` | Image size threshold for the SOCI index build and push | `419430400` | -| `tfyBuild.truefoundryWorkflows.sociIndexBuildAndPush.extraEnvs` | Extra environment variables for the SOCI index build and push | `[]` | -| `tfyBuild.truefoundryWorkflows.sociIndexBuildAndPush.extraVolumeMounts` | Extra volume mounts for the SOCI index build and push | `[]` | -| `tfyBuild.truefoundryWorkflows.sociIndexBuildAndPush.extraVolumes` | | `[]` | -| `tfy-buildkitd-service.enabled` | Bool to enable the tfy-buildkitd service | `false` | -| `tfy-buildkitd-service.service.port` | port number for the tfy-buildkitd service | `1234` | -| `tfy-buildkitd-service.replicaCount` | Number of replicas Value kept for future use, kept 1 | `1` | -| `tfy-buildkitd-service.tls.enabled` | Enable TLS for the tfy-buildkitd service | `false` | -| `tfy-buildkitd-service.tls.buildkitClientCertsSecretName` | Name of the secret containing the TLS certificate | `tfy-buildkit-client-certs` | -| `postgresql.auth.existingSecret` | Name of the existing secret for PostgreSQL authentication | `truefoundry-postgresql-auth-secret` | -| `postgresql.auth.database` | Name of the database for 
PostgreSQL | `truefoundry` | +` | +| `tfyBuild.truefoundryWorkflows.sfyBuilder.baseImagePullSecret` | Base image pull secret for the sfyBuilder | `""` | +| `tfyBuild.truefoundryWorkflows.sfyBuilder.resources.limits.cpu` | CPU limit for the sfyBuilder | `1` | +| `tfyBuild.truefoundryWorkflows.sfyBuilder.resources.limits.ephemeral-storage` | Ephemeral storage limit for the sfyBuilder | `20Gi` | +| `tfyBuild.truefoundryWorkflows.sfyBuilder.resources.limits.memory` | Memory limit for the sfyBuilder | `2Gi` | +| `tfyBuild.truefoundryWorkflows.sfyBuilder.resources.requests.cpu` | CPU request for the sfyBuilder | `200m` | +| `tfyBuild.truefoundryWorkflows.sfyBuilder.resources.requests.ephemeral-storage` | Ephemeral storage request for the sfyBuilder | `10Gi` | +| `tfyBuild.truefoundryWorkflows.sfyBuilder.resources.requests.memory` | Memory request for the sfyBuilder | `500Mi` | +| `tfyBuild.truefoundryWorkflows.sfyBuilder.buildSecrets` | Build secrets for the sfyBuilder | `[]` | +| `tfyBuild.truefoundryWorkflows.extraEnvs` | Extra environment variables for the tfyBuild server | `[]` | +| `tfyBuild.truefoundryWorkflows.extraVolumeMounts` | Extra volume mounts for the tfyBuild server | `[]` | +| `tfyBuild.truefoundryWorkflows.extraVolumes` | Extra volumes for the tfyBuild server | `[]` | +| `tfyBuild.truefoundryWorkflows.affinity` | Affinity settings for the tfyBuild server | `{}` | +| `tfyBuild.truefoundryWorkflows.nodeSelector` | Node selector for the tfyBuild server | `{}` | +| `tfyBuild.truefoundryWorkflows.logMarkers.error` | Error log marker for the tfyBuild server | `\u001b[31m[Error]\u001b[0m` | +| `tfyBuild.truefoundryWorkflows.logMarkers.done` | Done log marker for the tfyBuild server | `\u001b[32m[Done]\u001b[0m` | +| `tfyBuild.truefoundryWorkflows.logMarkers.start` | Start log marker for the tfyBuild server | `\u001b[36m[Start]\u001b[0m` | +| `tfyBuild.truefoundryWorkflows.logMarkers.clientPrefix` | Client prefix for the tfyBuild server | `["TFY-CLIENT"]` | +| 
`tfyBuild.truefoundryWorkflows.logMarkers.supportSlack` | Slack support URL for the tfyBuild server | `https://join.slack.com/t/truefoundry/shared_invite/zt-11ht512jq-nDJq~HJMqc6wBw90JVlo7g` | +| `tfyBuild.truefoundryWorkflows.logMarkers.serviceFoundryUiUrl` | Service foundry UI URL | `https://app.truefoundry.com/workspace` | +| `tfyBuild.truefoundryWorkflows.sociIndexBuildAndPush.enabled` | Bool to enable SOCI index build and push | `false` | +| `tfyBuild.truefoundryWorkflows.sociIndexBuildAndPush.image.repository` | Repository for the SOCI index build and push | `tfy.jfrog.io/tfy-images/soci-index-builder` | +| `tfyBuild.truefoundryWorkflows.sociIndexBuildAndPush.image.tag` | Tag for the SOCI index build and push | `0.2.0` | +| `tfyBuild.truefoundryWorkflows.sociIndexBuildAndPush.imagePullSecrets` | Image pull secrets for the sociIndexBuildAndPush | `[]` | +| `tfyBuild.truefoundryWorkflows.sociIndexBuildAndPush.imageSizeThresholdBytes` | Image size threshold for the SOCI index build and push | `419430400` | +| `tfyBuild.truefoundryWorkflows.sociIndexBuildAndPush.extraEnvs` | Extra environment variables for the SOCI index build and push | `[]` | +| `tfyBuild.truefoundryWorkflows.sociIndexBuildAndPush.extraVolumeMounts` | Extra volume mounts for the SOCI index build and push | `[]` | +| `tfyBuild.truefoundryWorkflows.sociIndexBuildAndPush.extraVolumes` | Extra volumes for the SOCI index build and push | `[]` | +| `tfy-buildkitd-service.enabled` | Bool to enable the tfy-buildkitd service | `false` | +| `postgresql.auth.existingSecret` | Name of the existing secret for PostgreSQL authentication | `truefoundry-postgresql-auth-secret` | +| `postgresql.auth.database` | Name of the database for PostgreSQL | `truefoundry` | ### tfyController Truefoundry tfy controller settings @@ -356,7 +346,7 @@ update-build.sh '{"status":"SUCCEEDED"}' | `tfyController.enabled` | Bool to enable the tfyController | `true` | | `tfyController.global` | Global values for the tfyController | `{}` | | `tfyController.image.repository` | 
Image repository for the tfyController | `tfy.jfrog.io/tfy-private-images/tfy-controller` | -| `tfyController.image.tag` | Image tag for the tfyController | `v0.7.0` | +| `tfyController.image.tag` | Image tag for the tfyController | `v0.2.0` | | `tfyController.environmentName` | Environment name for the tfyController | `default` | | `tfyController.envSecretName` | Secret name for the tfyController environment variables | `sfy-manifest-service-env-secret` | | `tfyController.imagePullPolicy` | Image pull policy for the tfyController | `IfNotPresent` | @@ -382,7 +372,7 @@ update-build.sh '{"status":"SUCCEEDED"}' | `tfyWorkflowAdmin.enabled` | Bool to enable the tfyWorkflowAdmin | `false` | | `tfyWorkflowAdmin.global` | Global values for the tfyWorkflowAdmin | `{}` | | `tfyWorkflowAdmin.image.repository` | Image repository for the tfyWorkflowAdmin | `tfy.jfrog.io/tfy-private-images/tfy-workflow-admin` | -| `tfyWorkflowAdmin.image.tag` | Image tag for the tfyWorkflowAdmin | `v0.7.1` | +| `tfyWorkflowAdmin.image.tag` | Image tag for the tfyWorkflowAdmin | `v0.3.0` | | `tfyWorkflowAdmin.environmentName` | Environment name for the tfyWorkflowAdmin | `default` | | `tfyWorkflowAdmin.envSecretName` | Secret name for the tfyWorkflowAdmin environment variables | `tfy-workflow-admin-env-secret` | | `tfyWorkflowAdmin.imagePullPolicy` | Image pull policy for the tfyWorkflowAdmin | `IfNotPresent` | diff --git a/charts/truefoundry/templates/tfy-build/preemptible-deployment.yaml b/charts/truefoundry/templates/tfy-build/preemptible-deployment.yaml index 81397027e..1892f3375 100644 --- a/charts/truefoundry/templates/tfy-build/preemptible-deployment.yaml +++ b/charts/truefoundry/templates/tfy-build/preemptible-deployment.yaml @@ -23,7 +23,7 @@ spec: {{- end }} containers: - name: alpine - image: {{ .Values.tfyBuild.preemptibleDeployment.image.repository }}:{{ .Values.tfyBuild.preemptibleDeployment.image.tag }} + image: {{ .Values.tfyBuild.truefoundryWorkflows.sfyBuilder.image.repository 
}}:{{ .Values.tfyBuild.truefoundryWorkflows.sfyBuilder.image.tag }} command: ["tail", "-f", "/dev/null"] {{- with .Values.tfyBuild.preemptibleDeployment.extraEnvs }} env: diff --git a/charts/truefoundry/tfy-build-scripts/build-and-push.sh b/charts/truefoundry/tfy-build-scripts/build-and-push.sh index 708eeac86..b87f0e650 100644 --- a/charts/truefoundry/tfy-build-scripts/build-and-push.sh +++ b/charts/truefoundry/tfy-build-scripts/build-and-push.sh @@ -40,4 +40,4 @@ echo "Time taken to build the image: $build_time seconds" echo -n "$build_time" > /opt/truefoundry/output/tfyTimeTakenToBuildImageSeconds printf "\033[36m[==== Docker logs end ====]\033[0m\n" -printf "%s Docker image built and pushed\n" "$DONE_MARKER" \ No newline at end of file +printf "$DONE_MARKER Docker image built and pushed\n" \ No newline at end of file diff --git a/charts/truefoundry/tfy-build-scripts/download-code.sh b/charts/truefoundry/tfy-build-scripts/download-code.sh index af3fb4002..7d8a89e8f 100755 --- a/charts/truefoundry/tfy-build-scripts/download-code.sh +++ b/charts/truefoundry/tfy-build-scripts/download-code.sh @@ -114,7 +114,7 @@ elif [[ $BUILD_TYPE == "azure" ]]; then elif [[ $BUILD_TYPE == "notebook_build" ]]; then : else - printf "%s Source type '%s' not supported.\n" "$FAILED_MARKER" "$BUILD_TYPE" + printf "$FAILED_MARKER Source type '%s' not supported.\n" "$BUILD_TYPE" exit 1 fi @@ -122,4 +122,4 @@ end_time=$(date +%s) source_code_download_time=$((end_time - start_time)) echo "Time taken to download the source code: $source_code_download_time seconds" echo -n "$source_code_download_time" > /opt/truefoundry/output/tfyTimeTakenToDownloadSourceCodeSeconds -printf "%s Download code completed\n" "$DONE_MARKER" \ No newline at end of file +printf "$DONE_MARKER Download code completed\n" \ No newline at end of file diff --git a/charts/truefoundry/values.yaml b/charts/truefoundry/values.yaml index c48e28680..ca4b7c631 100644 --- a/charts/truefoundry/values.yaml +++ 
b/charts/truefoundry/values.yaml @@ -15,7 +15,7 @@ global: ## @param global.controlPlaneURL URL of the control plane controlPlaneURL: "http://truefoundry-truefoundry-frontend-app.truefoundry.svc.cluster.local:5000" ## @param global.controlPlaneChartVersion Version of control-plane chart - controlPlaneChartVersion: 0.12.1 + controlPlaneChartVersion: 0.13.4 # If you have an existing truefoundry-creds secret, provide the name here. # This will ignore `.global.database` and `.global.tfyApiKey` values. ## @param global.existingTruefoundryCredsSecret Name of the existing truefoundry creds secret @@ -47,6 +47,16 @@ global: value: spot effect: NoSchedule operator: Equal + ## global.llmGatewayInfra LLM Gateway Infra configuration + llmGatewayInfra: + ## @param global.llmGatewayInfra.enabled Bool to enable LLM Gateway Infra + enabled: false + ## @param global.llmGatewayInfra.releaseName Release name for LLM Gateway Infra + releaseName: "tfy-llm-gateway-infra" + ## @param global.llmGatewayInfra.natsAdminPassword Password for nats admin + natsAdminPassword: "" + ## @param global.llmGatewayInfra.clickhousePassword Password for clickhouse user + clickhousePassword: "" ## devMode When enabled creates a dev installation of the control plane for testing purposes devMode: ## @param devMode.enabled Bool to enable dev mode @@ -84,7 +94,7 @@ truefoundryBootstrap: ## imagePullSecrets: [] ## @param truefoundryBootstrap.createdBuildkitServiceTlsCerts Bool to install TLS certificates - createdBuildkitServiceTlsCerts: "false" + createdBuildkitServiceTlsCerts: true ################################################################################################################### ####################################### Truefoundry Frontend App ################################################## ################################################################################################################### @@ -103,7 +113,7 @@ truefoundryFrontendApp: ## @param 
truefoundryFrontendApp.image.repository Image repository for the frontend app repository: "tfy.jfrog.io/tfy-private-images/truefoundry-frontend-app" ## @param truefoundryFrontendApp.image.tag Image tag for the frontend app - tag: "v0.11.0" + tag: "v0.12.2" ## @param truefoundryFrontendApp.envSecretName Secret name for the frontend app environment variables envSecretName: truefoundry-frontend-app-env-secret ## @param truefoundryFrontendApp.imagePullPolicy Image pull policy for the frontend app @@ -189,7 +199,7 @@ truefoundryFrontendApp: ## @param truefoundryFrontendApp.llmGateway.external Make LLMGateway external external: false ## @param truefoundryFrontendApp.llmGateway.backendHost Backend Host for the LLM gateway - backendHost: '{{ .Release.Name }}-llm-gateway.{{ .Release.Namespace }}.svc.cluster.local' + backendHost: 'tfy-llm-gateway.{{ .Release.Namespace }}.svc.cluster.local' ## @param truefoundryFrontendApp.llmGateway.backendPort Backend Port for the LLM gateway backendPort: 8787 ## @param truefoundryFrontendApp.proxyServerHost Proxy server host for the frontend app @@ -215,6 +225,7 @@ truefoundryFrontendApp: VITE_AUTHSERVER_URL: /api/auth VITE_MLFOUNDRY_URL: /api/ml VITE_SVCFOUNDRY_URL: /api/svc + VITE_TFYAGENT_URL: /api/agent VITE_MONITORINGFOUNDRY_URL: /api/monitoring VITE_LLM_PLAYGROUND_API_URL: /api/llm VITE_SOCKET_URL: "" @@ -223,19 +234,21 @@ truefoundryFrontendApp: VITE_MULTITENANT_ENABLED: "false" VITE_TENANT_NAME: "{{ .Values.global.tenantName }}" VITE_LOGO_URL: "" - VITE_EXPORT_AS_HELM_TYPES: service,volume,helm + VITE_EXPORT_AS_HELM_TYPES: service,volume,helm,job VITE_CREATE_AS_APPLICATION_SET: service,helm,job VITE_ENABLE_COMPANY_REGISTRATION: "false" VITE_ENABLE_FEATURE_RESOURCE_COSTS: "false" VITE_COST_ENABLED_TENANTS: "" - VITE_ENABLE_WORKFLOWS: "false" - VITE_ENABLE_RECOMMENDATION_INFOBAR: "false" + VITE_ENABLE_WORKFLOWS: "true" + VITE_ENABLE_RECOMMENDATION_INFOBAR: "true" VITE_ENABLE_AUTOPILOT: "false" VITE_AUTOPILOT_ENABLED_TENANTS: "" + 
VITE_ENABLE_TROUBLESHOOT: "false" + VITE_TROUBLESHOOT_ENABLED_TENANTS: "" VITE_ENABLE_PROMPT_MANAGEMENT: "false" VITE_PROMPT_MANAGEMENT_ENABLED_TENANTS: "" VITE_ENABLE_EVENTS_GRAPHS: "true" - VITE_ENABLE_CLUSTER_METRICS: "false" + VITE_ENABLE_CLUSTER_METRICS: "true" VITE_STRIPE_PUBLISHABLE_KEY: "" VITE_CREDIT_CARD_REQUIRED_DOMAINS: "" VITE_TENANT_BASE_DOMAIN: "" @@ -243,16 +256,16 @@ truefoundryFrontendApp: VITE_MANAGED_CLUSTER_ONBOARDING_SERVICE_URL: "" VITE_CIVO_RESOURCES_STRING: "" VITE_CLUSTER_ONBOARDING_FLOW_ENABLED: "false" - VITE_LLM_PLAYGROUND_ENABLED: "false" + VITE_LLM_PLAYGROUND_ENABLED: "{{ .Values.global.llmGatewayInfra.enabled }}" VITE_LLM_PLAYGROUND_ENABLE_STANDALONE: "false" VITE_LLM_PLAYGROUND_ENABLE_REDIRECT: "false" VITE_OLD_LLM_PLAYGROUND_PATH: llm-playground VITE_LLM_PLAYGROUND_PATH: llm-gateway - VITE_ENABLE_SENTRY: "false" + VITE_ENABLE_SENTRY: "true" VITE_ENABLE_PROMPT_TEMPLATES: "false" VITE_ENABLE_TOOLS_AGENTS: "false" VITE_AGENT_ENABLED_TENANTS: truefoundry,internal - VITE_BRAINFISH_WIDGET_ENABLED_TENANTS: brainfish + VITE_BRAINFISH_WIDGET_ENABLED_TENANTS: "" VITE_SENTRY_DSN: "" VITE_APEX_DOMAIN: http://truefoundry.com/ VITE_ITERATE_AI_KEY: "" @@ -263,7 +276,7 @@ truefoundryFrontendApp: VITE_ENABLE_MIXPANEL: "false" VITE_ENABLE_SCHEMA_VISUALISER: "false" VITE_SENTRY_AUTH_TOKEN: "" - VITE_SENTRY_ENVIRONMENT: "" + VITE_SENTRY_ENVIRONMENT: production VITE_MIXPANEL_TOKEN: "" VITE_APP_ENVIRONMENT: "" VITE_CRISP_WEBSITE_ID: "" @@ -293,7 +306,7 @@ mlfoundryServer: ## @param mlfoundryServer.image.repository Image repository for the mlfoundry server repository: "tfy.jfrog.io/tfy-private-images/mlfoundry-server" ## @param mlfoundryServer.image.tag Image tag for the mlfoundry server - tag: "v0.9.0" + tag: "v0.10.0" ## @param mlfoundryServer.replicaCount Number of replicas for the mlfoundry server replicaCount: 1 ## @param mlfoundryServer.environmentName Environment name for the mlfoundry server @@ -393,7 +406,7 @@ servicefoundryServer: ## @param 
servicefoundryServer.image.repository Image repository for the servicefoundry server repository: "tfy.jfrog.io/tfy-private-images/servicefoundry-server" ## @param servicefoundryServer.image.tag Image tag for the servicefoundry server - tag: "v0.12.0" + tag: "v0.13.3" ## @param servicefoundryServer.environmentName Environment name for the servicefoundry server environmentName: default ## @param servicefoundryServer.envSecretName Secret name for the servicefoundry server environment variables @@ -491,12 +504,11 @@ servicefoundryServer: FLYTE_ADMIN_URL: http://{{ .Release.Name }}-tfy-workflow-admin-server.{{ .Release.Namespace }}.svc.cluster.local:8089 CLUSTER_PROXY_URL: http://{{ .Release.Name }}-tfy-controller.{{ .Release.Namespace }}.svc.cluster.local:8123 BUILD_CALLBACK_URL: http://{{ .Release.Name }}-servicefoundry-server.{{ .Release.Namespace }}.svc.cluster.local:3000 - LLM_GATEWAY_URL: "" + LLM_GATEWAY_URL: "{{ .Values.global.controlPlaneURL }}/api/llm" VCS_INTEGRATION_STATE_OBJECT_HASH_SECRET: "" GITHUB_INSTALLATION_URL: "" GITHUB_PRIVATE_KEY: "" GITHUB_APP_ID: "" - GITHUB_PAT: "" BITBUCKET_CLIENT_ID: "" BITBUCKET_CLIENT_SECRET: "" BITBUCKET_APP_PASSWORD: "" @@ -505,18 +517,14 @@ servicefoundryServer: GITLAB_SCOPE: "" AZURE_CLIENT_ID: "" AZURE_CLIENT_SECRET: "" - CLICKHOUSE_HOST: "" - CLICKHOUSE_USER: "" - CLICKHOUSE_PASSWORD: "" - CLICKHOUSE_WAIT_TIMEOUT: 5m - STORAGE_CLASS_ACCESS_MODES: "" - TRUEFOUNDRY_PUBLIC_ENABLED: "" + CLICKHOUSE_ENABLED: "{{ .Values.global.llmGatewayInfra.enabled }}" + CLICKHOUSE_HOST: http://clickhouse-{{ .Values.global.llmGatewayInfra.releaseName }}.{{ .Release.Namespace }}.svc.cluster.local:8123 + CLICKHOUSE_USER: user + CLICKHOUSE_PASSWORD: "{{ .Values.global.llmGatewayInfra.clickhousePassword }}" OAUTH_PROVIDER_TYPE: TRUEFOUNDRY EXTERNAL_OAUTH_ISSUER: "" EXTERNAL_OAUTH_CLIENT_ID: "" EXTERNAL_OAUTH_CLIENT_SECRET: "" - GLOBAL_BUILDERS_BUILD_PER_URL: "0" - APPLICATION_TYPES_ENABLED_FOR_SFY_SERVER_VALIDATION: 
service,async-service,job,volume,notebook,codeserver,ssh-server,helm,application-set configs: ## @param servicefoundryServer.configs.cicdTemplates CICD Template for servicefoundry server cicdTemplates: '{{ .Release.Name }}-cicd-templates-cm' @@ -544,7 +552,7 @@ tfyK8sController: ## @param tfyK8sController.image.repository Image repository for the tfyK8sController repository: "tfy.jfrog.io/tfy-private-images/tfy-k8s-controller" ## @param tfyK8sController.image.tag Image tag for the tfyK8sController - tag: "v0.9.0" + tag: "v0.10.1" ## @param tfyK8sController.environmentName Environment name for tfyK8sController environmentName: default ## @param tfyK8sController.envSecretName Secret name for the tfyK8sController environment variables @@ -618,6 +626,8 @@ tfyK8sController: ANALYTICS_SERVER_URL: https://analytics.truefoundry.com AUTH_SERVER_URL: https://auth.truefoundry.com TENANT_NAME: "{{ .Values.global.tenantName }}" + ENABLE_GATEWAY_CONSUMER: "{{ .Values.global.llmGatewayInfra.enabled }}" + GATEWAY_NATS_URL: http://admin:{{ .Values.global.llmGatewayInfra.natsAdminPassword }}@{{ .Values.global.llmGatewayInfra.releaseName }}-nats.{{ .Release.Namespace }}.svc.cluster.local:4222 NODE_ENV: production ################################################################################################################### ####################################### Sfy Manifest Service ###################################################### @@ -636,7 +646,7 @@ sfyManifestService: ## @param sfyManifestService.image.repository Image repository for the sfy manifest service repository: "tfy.jfrog.io/tfy-private-images/sfy-manifest-service" ## @param sfyManifestService.image.tag Image tag for the sfy manifest service - tag: "v0.9.0" + tag: "v0.10.0" ## @param sfyManifestService.replicaCount Number of replicas for the sfy manifest service replicaCount: 1 ## @param sfyManifestService.environmentName Environment name for the sfy manifest service @@ -742,7 +752,7 @@ nats: ## @param 
nats.nats.image.repository NATS server image repository repository: tfy.jfrog.io/tfy-mirror/nats ## @param nats.nats.image.tag NATS server image tag - tag: 2.10.21-alpine3.20 + tag: 2.10.22-alpine3.20 ## @param nats.nats.advertise Bool to enable NATS server advertise advertise: false ## @param nats.nats.imagePullSecrets Image pull credentials for NATS server @@ -885,12 +895,6 @@ tfyBuild: preemptibleDeployment: ## @param tfyBuild.preemptibleDeployment.enabled Bool to enable preemptible deployment for the tfyBuild server enabled: false - ## tfyBuild.preemptibleDeployment.image - image: - ## @param tfyBuild.preemptibleDeployment.image.repository Repository for the preemptible deployment - repository: tfy.jfrog.io/tfy-mirror/alpine - ## @param tfyBuild.preemptibleDeployment.image.tag Tag for the preemptible deployment - tag: "3.20" ## @param tfyBuild.preemptibleDeployment.imagePullSecrets Image pull secrets for the preemptible deployment imagePullSecrets: [] ## @param tfyBuild.preemptibleDeployment.affinity Affinity settings for the preemptible deployment @@ -919,7 +923,7 @@ tfyBuild: ## imagePullSecrets: [] ## @param tfyBuild.truefoundryWorkflows.sfyBuilder.baseImagePullSecret baseImagePullSecret for the docker config - baseImagePullSecret: "" + baseImagePullSecret: "truefoundry-image-pull-secret" ## @param tfyBuild.truefoundryWorkflows.sfyBuilder.script script for the sfyBuilder to be executed script: | download-code.sh @@ -994,11 +998,11 @@ tfyBuild: ## tfyBuild.truefoundryWorkflows.logMarkers Log markers for the tfyBuild server logMarkers: ## @param tfyBuild.truefoundryWorkflows.logMarkers.error Error log marker for the tfyBuild server - error: \u001b[31m[Error]\u001b[0m + error: \033[31m[Error]\033[0m ## @param tfyBuild.truefoundryWorkflows.logMarkers.done Done log marker for the tfyBuild server - done: \u001b[32m[Done]\u001b[0m + done: \033[32m[Done]\033[0m ## @param tfyBuild.truefoundryWorkflows.logMarkers.start Start log marker for the tfyBuild server - start: 
\u001b[36m[Start]\u001b[0m + start: \033[36m[Start]\033[0m ## @param tfyBuild.truefoundryWorkflows.logMarkers.clientPrefix Client prefix for the tfyBuild server clientPrefix: - TFY-CLIENT @@ -1034,16 +1038,22 @@ tfyBuild: ## tfy-buildkitd-service Settings corresponding to the tfy-buildkitd service tfy-buildkitd-service: ## @param tfy-buildkitd-service.enabled Bool to enable the tfy-buildkitd service - enabled: false + enabled: true ## tfy-buildkitd-service.service Service settings for the tfy-buildkitd service service: ## @param tfy-buildkitd-service.service.port port number for the tfy-buildkitd service port: 1234 ## @param tfy-buildkitd-service.replicaCount Number of replicas Value kept for future use, kept 1 replicaCount: 1 + ## @param tfy-buildkitd-service.podAnnotations Annotations for the tfy-buildkitd service pods + podAnnotations: + cluster-autoscaler.kubernetes.io/safe-to-evict: "false" + karpenter.sh/do-not-disrupt: 'true' + karpenter.sh/do-not-evict: 'true' + ## tfy-buildkitd-service.tls TLS settings for the tfy-buildkitd service tls: ## @param tfy-buildkitd-service.tls.enabled Enable TLS for the tfy-buildkitd service - enabled: false + enabled: true ## @param tfy-buildkitd-service.tls.buildkitClientCertsSecretName Name of the secret containing the TLS certificate buildkitClientCertsSecretName: "tfy-buildkit-client-certs" # To further configure the local postgres installation use the following section. @@ -1087,7 +1097,7 @@ tfyController: ## @param tfyController.image.repository Image repository for the tfyController repository: tfy.jfrog.io/tfy-private-images/tfy-controller ## @param tfyController.image.tag Image tag for the tfyController - tag: v0.7.0 + tag: v0.8.0 ## @param tfyController.environmentName Environment name for the tfyController environmentName: default ## @param tfyController.envSecretName Secret name for the tfyController environment variables